diff --git a/docs/regressions-wikipedia-dpr-100w-bm25.md b/docs/regressions-wikipedia-dpr-100w-bm25.md
new file mode 100644
index 000000000..092657da9
--- /dev/null
+++ b/docs/regressions-wikipedia-dpr-100w-bm25.md
@@ -0,0 +1,143 @@
+# Anserini Regressions: QA with wikipedia-dpr-100w Corpus
+
+**Models**: BM25
+
+This page documents QA regression experiments on the wikipedia-dpr-100w corpus, which is integrated into Anserini's regression testing framework.
+
+The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/wikipedia-dpr-100w-bm25.yaml).
+Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/wikipedia-dpr-100w-bm25.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --convert --regression wikipedia-dpr-100w-bm25
+```
+
+## Indexing
+
+Typical indexing command:
+
+```bash
+target/appassembler/bin/IndexCollection \
+  -collection JsonCollection \
+  -input /path/to/wikipedia-dpr-100w \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -generator DefaultLuceneDocumentGenerator \
+  -threads 43 -storeRaw \
+  >& logs/log.wikipedia-dpr-100w &
+```
+
+The directory `/path/to/wikipedia-dpr-100w/` should be a directory containing the wikipedia-dpr-100w passages collection, retrieved from [here](https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz).
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the test sets of multiple QA datasets, namely Natural Questions, TriviaQA, SQuAD, and WebQuestions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchCollection \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -topics src/main/resources/topics-and-qrels/topics.dpr.nq.test.txt \
+  -topicreader DprNq \
+  -output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.nq.test.txt \
+  -bm25 &
+target/appassembler/bin/SearchCollection \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -topics src/main/resources/topics-and-qrels/topics.dpr.trivia.test.txt \
+  -topicreader DprNq \
+  -output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.trivia.test.txt \
+  -bm25 &
+target/appassembler/bin/SearchCollection \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -topics src/main/resources/topics-and-qrels/topics.dpr.squad.test.txt \
+  -topicreader DprJsonl \
+  -output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.squad.test.txt \
+  -bm25 &
+target/appassembler/bin/SearchCollection \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -topics src/main/resources/topics-and-qrels/topics.dpr.wq.test.txt \
+  -topicreader DprJsonl \
+  -output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.wq.test.txt \
+  -bm25 &
+target/appassembler/bin/SearchCollection \
+  -index indexes/lucene-index.wikipedia-dpr-100w/ \
+  -topics src/main/resources/topics-and-qrels/topics.nq.test.txt \
+  -topicreader DprNq \
+  -output runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt \
+  -bm25 &
+```
+
+The TREC-format run files need to be converted to DPR's JSON format for evaluation:
+```bash
+python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+  --index indexes/lucene-index.wikipedia-dpr-100w/ \
+  --topics dpr-nq-test \
+  --input runs/run.wikipedia-dpr-100w.bm25.topics.dpr.nq.test.txt \
+  --output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.nq.test.txt.json \
+  &
+python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+  --index indexes/lucene-index.wikipedia-dpr-100w/ \
+  --topics dpr-trivia-test \
+  --input runs/run.wikipedia-dpr-100w.bm25.topics.dpr.trivia.test.txt \
+  --output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.trivia.test.txt.json \
+  &
+python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+  --index indexes/lucene-index.wikipedia-dpr-100w/ \
+  --topics dpr-squad-test \
+  --input runs/run.wikipedia-dpr-100w.bm25.topics.dpr.squad.test.txt \
+  --output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.squad.test.txt.json \
+  &
+python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+  --index indexes/lucene-index.wikipedia-dpr-100w/ \
+  --topics dpr-wq-test \
+  --input runs/run.wikipedia-dpr-100w.bm25.topics.dpr.wq.test.txt \
+  --output runs/run.wikipedia-dpr-100w.bm25.topics.dpr.wq.test.txt.json \
+  &
+python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+  --index indexes/lucene-index.wikipedia-dpr-100w/ \
+  --topics nq-test \
+  --input runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt \
+  --output runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt.json \
+  &
+```
+
+Evaluation can be performed using scripts from Pyserini:
+
+```bash
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 20 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.nq.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 100 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.nq.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 20 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.trivia.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 100 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.trivia.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 20 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.squad.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 100 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.squad.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 20 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.wq.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 100 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.dpr.wq.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 20 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt.json
+python -m pyserini.eval.evaluate_dpr_retrieval --topk 100 --retrieval runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt.json
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **top_20_accuracy** | **BM25 (default parameters)** |
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DPR: Natural Questions Test](https://github.com/facebookresearch/DPR) | 0.6294 |
+| [DPR: TriviaQA Test](https://github.com/facebookresearch/DPR) | 0.7641 |
+| [DPR: SQuAD Test](https://github.com/facebookresearch/DPR) | 0.7109 |
+| [DPR: WebQuestions Test](https://github.com/facebookresearch/DPR) | 0.6240 |
+| [EfficientQA: Natural Questions Test](https://efficientqa.github.io/) | 0.6399 |
+| **top_100_accuracy** | **BM25 (default parameters)** |
+| [DPR: Natural Questions Test](https://github.com/facebookresearch/DPR) | 0.7825 |
+| [DPR: TriviaQA Test](https://github.com/facebookresearch/DPR) | 0.8315 |
+| [DPR: SQuAD Test](https://github.com/facebookresearch/DPR) | 0.8184 |
+| [DPR: WebQuestions Test](https://github.com/facebookresearch/DPR) | 0.7549 |
+| [EfficientQA: Natural Questions Test](https://efficientqa.github.io/) | 0.7922 |
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/wikipedia-dpr-100w-bm25.template) and run `bin/build.sh` to rebuild the documentation.
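
The `run_regression.py` changes below add the conversion stage and teach the evaluation step to pick up converted output. The key behavior: when a `conversions` stanza is present in the regression YAML, the metric command is pointed at the run file with the last conversion's `out_file_ext` appended. A minimal sketch of that derivation (the helper name here is hypothetical; the real logic is inlined in `evaluate_and_verify` below):

```python
# Minimal sketch (hypothetical helper) of how evaluate_and_verify() picks the
# file handed to the metric command: with a `conversions` stanza present, the
# out_file_ext of the *last* conversion is appended to the run-file path.
def eval_input_path(run_path: str, yaml_data: dict) -> str:
    conversions = yaml_data.get('conversions')
    if conversions and conversions[-1].get('out_file_ext'):
        return run_path + conversions[-1]['out_file_ext']
    return run_path

# With this PR's YAML (out_file_ext: .json), evaluation reads the converted
# JSON run rather than the raw TREC run:
assert eval_input_path('runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt',
                       {'conversions': [{'out_file_ext': '.json'}]}) \
       == 'runs/run.wikipedia-dpr-100w.bm25.topics.nq.test.txt.json'
```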
diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py
index b7f4127cd..8918c03a4 100644
--- a/src/main/python/run_regression.py
+++ b/src/main/python/run_regression.py
@@ -128,14 +128,27 @@ def construct_search_commands(yaml_data):
             SEARCH_COMMAND,
             '-index', construct_index_path(yaml_data),
             '-topics', os.path.join(yaml_data['topic_root'], topic_set['path']),
-            '-topicreader', yaml_data['topic_reader'],
+            '-topicreader', topic_set['topic_reader'] if 'topic_reader' in topic_set and topic_set['topic_reader'] else yaml_data['topic_reader'],
             '-output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']),
-            model['params']
+            model['params'],
         ]
         for (model, topic_set) in list(itertools.product(yaml_data['models'], yaml_data['topics']))
     ]

     return ranking_commands

+def construct_convert_commands(yaml_data):
+    converting_commands = [
+        [
+            conversion['command'],
+            '--index', construct_index_path(yaml_data),
+            '--topics', topic_set['id'],
+            '--input', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + conversion['in_file_ext'],
+            '--output', construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + conversion['out_file_ext'],
+            conversion['params'] if 'params' in conversion and conversion['params'] else '',
+        ]
+        for (model, topic_set, conversion) in list(itertools.product(yaml_data['models'], yaml_data['topics'], yaml_data['conversions']))
+    ]
+
+    return converting_commands
+
 def evaluate_and_verify(yaml_data, dry_run):
     fail_str = '\033[91m[FAIL]\033[0m '
@@ -148,8 +161,8 @@ def evaluate_and_verify(yaml_data, dry_run):
             for metric in yaml_data['metrics']:
                 eval_cmd = [
                     os.path.join(metric['command']), metric['params'] if 'params' in metric and metric['params'] else '',
-                    os.path.join(yaml_data['qrels_root'], topic_set['qrel']),
-                    construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name'])
+                    os.path.join(yaml_data['qrels_root'], topic_set['qrel']) if 'qrel' in topic_set and topic_set['qrel'] else '',
+                    construct_runfile_path(yaml_data['corpus'], topic_set['id'], model['name']) + (yaml_data['conversions'][-1]['out_file_ext'] if 'conversions' in yaml_data and yaml_data['conversions'][-1]['out_file_ext'] else '')
                 ]
                 if dry_run:
                     logger.info(' '.join(eval_cmd))
@@ -181,6 +194,9 @@ def run_search(cmd):
     logger.info(' '.join(cmd))
     call(' '.join(cmd), shell=True)

+def run_convert(cmd):
+    logger.info(' '.join(cmd))
+    call(' '.join(cmd), shell=True)

 # https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
 class TqdmUpTo(tqdm):
@@ -259,6 +275,10 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
     parser.add_argument('--search', dest='search', action='store_true',
                         help='Search and verify results.')
     parser.add_argument('--search-pool', dest='search_pool', type=int, default=4,
                         help='Number of ranking runs to execute in parallel.')
+    parser.add_argument('--convert', dest='convert', action='store_true',
+                        help='Convert TREC output format to DPR\'s JSON format for QA.')
+    parser.add_argument('--convert-pool', dest='convert_pool', type=int, default=4,
+                        help='Number of conversion runs to execute in parallel.')
     parser.add_argument('--dry-run', dest='dry_run', action='store_true',
                         help='Output commands without actual execution.')

     args = parser.parse_args()
@@ -329,4 +349,14 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
         with Pool(args.search_pool) as p:
             p.map(run_search, search_cmds)

+    if args.convert:
+        logger.info('='*10 + ' Converting ' + '='*10)
+        convert_cmds = construct_convert_commands(yaml_data)
+        if args.dry_run:
+            for cmd in convert_cmds:
+                logger.info(' '.join(cmd))
+        else:
+            with Pool(args.convert_pool) as p:
+                p.map(run_convert, convert_cmds)
+
     evaluate_and_verify(yaml_data, args.dry_run)
diff --git a/src/main/resources/docgen/templates/wikipedia-dpr-100w-bm25.template b/src/main/resources/docgen/templates/wikipedia-dpr-100w-bm25.template
new file mode 100644
index 000000000..2e27c8423
--- /dev/null
+++ b/src/main/resources/docgen/templates/wikipedia-dpr-100w-bm25.template
@@ -0,0 +1,58 @@
+# Anserini Regressions: QA with wikipedia-dpr-100w Corpus
+
+**Models**: BM25
+
+This page documents QA regression experiments on the wikipedia-dpr-100w corpus, which is integrated into Anserini's regression testing framework.
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --convert --regression ${test_name}
+```
+
+## Indexing
+
+Typical indexing command:
+
+```bash
+${index_cmds}
+```
+
+The directory `/path/to/${corpus}/` should be a directory containing the wikipedia-dpr-100w passages collection, retrieved from [here](https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz).
+
+For additional details, see explanation of [common indexing options](common-indexing-options.md).
+
+## Retrieval
+
+Topics are stored in [`src/main/resources/topics-and-qrels/`](../src/main/resources/topics-and-qrels/).
+The regression experiments here evaluate on the test sets of multiple QA datasets, namely Natural Questions, TriviaQA, SQuAD, and WebQuestions.
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+The TREC-format run files need to be converted to DPR's JSON format for evaluation:
+```bash
+${converting_cmds}
+```
+
+Evaluation can be performed using scripts from Pyserini:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
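
One detail worth calling out before the YAML: this regression mixes two topic readers (`DprNq` and `DprJsonl`), which is why both the Python driver above and the Java docgen below gain a per-topic `topic_reader` that takes precedence over the collection-level default. A sketch of the precedence rule, assuming topics and the YAML root are plain dicts shaped like the YAML below:

```python
# Sketch of the reader-precedence rule shared by construct_search_commands()
# (Python) and generateRankingCommand() (Java): a topic-level topic_reader
# wins; otherwise fall back to the collection-wide default.
def resolve_topic_reader(topic_set: dict, yaml_data: dict) -> str:
    return topic_set.get('topic_reader') or yaml_data.get('topic_reader')

assert resolve_topic_reader({'id': 'dpr-squad-test', 'topic_reader': 'DprJsonl'},
                            {'topic_reader': 'DprNq'}) == 'DprJsonl'
assert resolve_topic_reader({'id': 'nq-test', 'topic_reader': 'DprNq'}, {}) == 'DprNq'
```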
diff --git a/src/main/resources/regression/wikipedia-dpr-100w-bm25.yaml b/src/main/resources/regression/wikipedia-dpr-100w-bm25.yaml
new file mode 100644
index 000000000..eb49c6a59
--- /dev/null
+++ b/src/main/resources/regression/wikipedia-dpr-100w-bm25.yaml
@@ -0,0 +1,77 @@
+---
+corpus: wikipedia-dpr-100w
+corpus_path: /store/collections/wikipedia/wikipedia-dpr-100w
+
+index_path: indexes/lucene-index.wikipedia-dpr-100w/
+collection_class: JsonCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 43
+index_options: -storeRaw
+index_stats:
+  documents: 21015324
+  documents (non-empty): 21015324
+  total terms: 1512973270
+
+conversions:
+  - command: python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run
+    params:
+    in_file_ext: ""
+    out_file_ext: .json
+
+metrics:
+  - metric: top_20_accuracy
+    command: python -m pyserini.eval.evaluate_dpr_retrieval
+    params: --topk 20 --retrieval
+    separator: " "
+    parse_index: 1
+    metric_precision: 4
+    can_combine: false
+  - metric: top_100_accuracy
+    command: python -m pyserini.eval.evaluate_dpr_retrieval
+    params: --topk 100 --retrieval
+    separator: " "
+    parse_index: 1
+    metric_precision: 4
+    can_combine: false
+
+topic_root: src/main/resources/topics-and-qrels/
+qrels_root:
+topics:
+  - name: "[DPR: Natural Questions Test](https://github.com/facebookresearch/DPR)"
+    id: dpr-nq-test
+    path: topics.dpr.nq.test.txt
+    topic_reader: DprNq
+  - name: "[DPR: TriviaQA Test](https://github.com/facebookresearch/DPR)"
+    id: dpr-trivia-test
+    path: topics.dpr.trivia.test.txt
+    topic_reader: DprNq
+  - name: "[DPR: SQuAD Test](https://github.com/facebookresearch/DPR)"
+    id: dpr-squad-test
+    path: topics.dpr.squad.test.txt
+    topic_reader: DprJsonl
+  - name: "[DPR: WebQuestions Test](https://github.com/facebookresearch/DPR)"
+    id: dpr-wq-test
+    path: topics.dpr.wq.test.txt
+    topic_reader: DprJsonl
+  - name: "[EfficientQA: Natural Questions Test](https://efficientqa.github.io/)"
+    id: nq-test
+    path: topics.nq.test.txt
+    topic_reader: DprNq
+
+models:
+  - name: bm25
+    display: BM25 (default parameters)
+    params: -bm25
+    results:
+      top_20_accuracy:
+        - 0.6294
+        - 0.7641
+        - 0.7109
+        - 0.6240
+        - 0.6399
+      top_100_accuracy:
+        - 0.7825
+        - 0.8315
+        - 0.8184
+        - 0.7549
+        - 0.7922
\ No newline at end of file
diff --git a/src/test/java/io/anserini/doc/DataModel.java b/src/test/java/io/anserini/doc/DataModel.java
index 8660bbb04..2fe77203c 100755
--- a/src/test/java/io/anserini/doc/DataModel.java
+++ b/src/test/java/io/anserini/doc/DataModel.java
@@ -161,6 +161,7 @@ public void setQrels_root(String qrels_root) {
   private List<Metric> metrics;
   private List<Model> models;
   private List<Topic> topics;
+  private List<Conversion> conversions;

   public List<Metric> getMetrics() {
     return metrics;
@@ -186,11 +187,20 @@ public void setModels(List<Model> models) {
     this.models = models;
   }

+  public List<Conversion> getConversions() {
+    return conversions;
+  }
+
+  public void setConversions(List<Conversion> conversions) {
+    this.conversions = conversions;
+  }
+
   static class Topic {
     private String name;
     private String id;
     private String path;
     private String qrel;
+    private String topic_reader;

     public String getName() { return name; }
     public void setName(String name) { this.name = name; }
@@ -200,6 +210,8 @@ static class Topic {
     public void setPath(String path) { this.path = path; }
     public String getQrel() { return qrel; }
     public void setQrel(String qrel) { this.qrel = qrel; }
+    public String getTopic_reader() { return topic_reader; }
+    public void setTopic_reader(String topic_reader) { this.topic_reader = topic_reader; }
   }

   static class Model {
@@ -218,6 +230,22 @@ static class Model {
     public void setParams(String params) { this.params = params; }
   }

+  static class Conversion {
+    private String command;
+    private String in_file_ext;
+    private String out_file_ext;
+    private String params;
+
+    public String getCommand() { return command; }
+    public void setCommand(String command) { this.command = command; }
+    public String getIn_file_ext() { return in_file_ext; }
+    public void setIn_file_ext(String in_file_ext) { this.in_file_ext = in_file_ext; }
+    public String getOut_file_ext() { return out_file_ext; }
+    public void setOut_file_ext(String out_file_ext) { this.out_file_ext = out_file_ext; }
+    public String getParams() { return params; }
+    public void setParams(String params) { this.params = params; }
+  }
+
   static class Metric {
     private String command;
     private String params;
@@ -281,7 +309,7 @@ public String generateRankingCommand(String collection) {
       builder.append(SEARCH_COMMAND).append(" \\\n");
       builder.append("  -index").append(" ").append(getIndex_path()).append(" \\\n");
       builder.append("  -topics").append(" ").append(Paths.get(getTopic_root(), topic.getPath()).toString()).append(" \\\n");
-      builder.append("  -topicreader").append(" ").append(getTopic_reader()).append(" \\\n");
+      builder.append("  -topicreader").append(" ").append((topic.getTopic_reader() == null) ? getTopic_reader() : topic.getTopic_reader()).append(" \\\n");
       builder.append("  -output").append(" ").append(generateRunFile(collection, model, topic)).append(" \\\n");
       if (model.getParams() != null) {
         builder.append("  ").append(model.getParams());
@@ -295,6 +323,31 @@ public String generateRankingCommand(String collection) {
     return builder.toString().trim();
   }

+  public String generateConvertingCommand(String collection) {
+    StringBuilder builder = new StringBuilder();
+    if (getConversions() != null) {
+      for (Conversion conversion : getConversions()) {
+        for (Model model : getModels()) {
+          for (Topic topic : getTopics()) {
+            builder.append(conversion.getCommand()).append(" \\\n");
+            builder.append("  --index").append(" ").append(getIndex_path()).append(" \\\n");
+            builder.append("  --topics").append(" ").append(topic.getId()).append(" \\\n");
+            builder.append("  --input").append(" ").append(generateRunFile(collection, model, topic) + ((conversion.getIn_file_ext() == null) ? "" : conversion.getIn_file_ext())).append(" \\\n");
+            builder.append("  --output").append(" ").append(generateRunFile(collection, model, topic) + conversion.getOut_file_ext()).append(" \\\n");
+            if (conversion.getParams() != null) {
+              builder.append("  ").append(conversion.getParams());
+            }
+            builder.append(" &"); // run in background
+            builder.append("\n");
+          }
+          builder.append("\n");
+        }
+      }
+    }
+
+    return builder.toString().trim();
+  }
+
   public String generateEvalCommand(String collection) {
     StringBuilder builder = new StringBuilder();
     for (Model model : getModels()) {
@@ -307,8 +360,15 @@ public String generateEvalCommand(String collection) {
           evalCmdOption += " " + eval.getParams();
         }
         String evalCmdResidual = "";
-        evalCmdResidual += " " + Paths.get(getQrels_root(), topic.getQrel());
+        if (topic.getQrel() != null) {
+          evalCmdResidual += " " + Paths.get(getQrels_root(), topic.getQrel());
+        }
         evalCmdResidual += " " + generateRunFile(collection, model, topic);
+        List<Conversion> conversions = getConversions();
+        if (conversions != null) {
+          Conversion lastConversion = conversions.get(conversions.size() - 1);
+          evalCmdResidual += lastConversion.getOut_file_ext();
+        }
         evalCmdResidual += "\n";
         if (eval.isCan_combine() || evalCmdOption.isEmpty()) {
           combinedEvalCmd.putIfAbsent(evalCmd, new HashMap<>());
diff --git a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java
index 803dba189..8070c45f5 100755
--- a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java
+++ b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java
@@ -51,6 +51,7 @@ public void main() throws Exception {
       valuesMap.put("corpus", corpus);
       valuesMap.put("index_cmds", data.generateIndexingCommand(corpus));
       valuesMap.put("ranking_cmds", data.generateRankingCommand(corpus));
+      valuesMap.put("converting_cmds", data.generateConvertingCommand(corpus));
       valuesMap.put("eval_cmds", data.generateEvalCommand(corpus));
       valuesMap.put("effectiveness", data.generateEffectiveness(corpus));