From 259d8ecedbba833386f9300a2667ef61b20943d8 Mon Sep 17 00:00:00 2001
From: Kelvin Jiang <20145768+kelvin-jiang@users.noreply.github.com>
Date: Sun, 30 May 2021 16:33:37 -0400
Subject: [PATCH] Fixed bug in FEVER experiment regarding trec_eval params
 (#1552)

---
 docs/experiments-fever.md          | 10 +++++-----
 src/main/python/fever/tune_bm25.py |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/experiments-fever.md b/docs/experiments-fever.md
index 1bd03bb5b2..35e195263d 100644
--- a/docs/experiments-fever.md
+++ b/docs/experiments-fever.md
@@ -119,10 +119,10 @@ This run produces the following results:
 |:----|----------------:|----------------:|
 | 1   | 0.3887          | 0.5925          |
 | 5   | 0.6517          | 0.7678          |
-| 10  |	0.7349          | 0.8233          |
-| 25  |	0.8117          | 0.8745          |
-| 50  |	0.8570          | 0.9047          |
-| 100 |	0.8900          | 0.9267          |
+| 10  | 0.7349          | 0.8233          |
+| 25  | 0.8117          | 0.8745          |
+| 50  | 0.8570          | 0.9047          |
+| 100 | 0.8900          | 0.9267          |
 
 Note that this outperforms the TF-IDF baseline in the FEVER paper at every value of k.
 
@@ -158,7 +158,7 @@ python src/main/python/fever/tune_bm25.py \
  --runs_folder runs/fever-bm25 \
  --index_folder indexes/fever/lucene-index-fever-paragraph \
  --queries_file collections/fever/queries.paragraph.train-subset.tsv \
- --qrels_file collections/fever/qrels.paragraph.train-subset.tsv
+ --qrels_file collections/fever/qrels.paragraph.train-subset.txt
 ```
 
 From the grid search, we observe that the parameters `k1=0.9`, `b=0.1` perform fairly well. If we retrieve on the dev set with these parameters:
diff --git a/src/main/python/fever/tune_bm25.py b/src/main/python/fever/tune_bm25.py
index 07da6ab90f..d76800f8bf 100644
--- a/src/main/python/fever/tune_bm25.py
+++ b/src/main/python/fever/tune_bm25.py
@@ -44,10 +44,10 @@ def evaluate_runs(args):
         run_file = os.path.join(args.runs_folder, file)
         # evaluate with trec_eval
         results = subprocess.check_output(['tools/eval/trec_eval.9.0.4/trec_eval',
+                                           '-mrecall.100',
+                                           '-mmap',
                                            args.qrels_file,
-                                           run_file,
-                                           '-m recall.100',
-                                           '-m map'])
+                                           run_file])
         # regex match trec_eval output to get metrics
         match = re.search('map +\tall\t([0-9.]+)', results.decode('utf-8'))
         map = float(match.group(1))