forked from microsoft/msmarco
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQnA.py
212 lines (181 loc) · 8.42 KB
/
QnA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
This module computes evaluation metrics for MS MaRCo data set.
For first time execution, please use run.sh to download necessary dependencies.
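Command line:
/ms_marco_metrics$ PYTHONPATH=./bleu python QnA.py <path_to_candidate_file>
Reference data is read from fixed files in the working directory:
eval_v2.1.json (source of the NLGEN reference) and QnAEval.jsonl (QnA reference).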
Creation Date : 12/15/2018
Last Modified : 03/20/2019
Authors : Tri Nguyen <trnguye@microsoft.com>, Xia Song <xiaso@microsoft.com>, Tong Wang <tongw@microsoft.com>, Daniel Campos <dacamp@microsoft.com>
"""
from __future__ import print_function
import json
import sys
import pandas as pd
import spacy
from bleu.bleu import Bleu
from rouge.rouge import Rouge
from spacy.lang.en import English as NlpEnglish
# Load the spaCy model package 'en_core_web_lg' when this module is imported.
# NOTE(review): `nlp` is not referenced anywhere else in the visible code —
# confirm it is still needed before paying the load cost on every run.
nlp = spacy.load('en_core_web_lg')
# JSON field names expected in every reference/candidate record.
QUERY_ID_JSON_ID = 'query_id'
ANSWERS_JSON_ID = 'answers'
# Maximum n-gram order passed to the BLEU scorer by main().
MAX_BLEU_ORDER = 4
# Tokenizer pipeline cache; created on the first call to normalize_batch().
NLP = None
# Evaluation set read by make_well_formed().
EVALFILENAME = 'eval_v2.1.json'
# Candidate answers restricted to well-formed queries; written by make_well_formed().
TEMPNLGENCANDIDATE = 'NLGENCandidate.jsonl'
# Reference file for the NLGEN evaluation; written by make_well_formed().
NLGENREFERENCE = 'NLGENEval.jsonl'
# Reference file for the QnA evaluation; read by main() and not created in the visible code.
QNAREFERENCE = 'QnAEval.jsonl'
def make_well_formed(candidateFilename):
    """Build the NLGEN reference and candidate files from the evaluation set.

    Reads EVALFILENAME with pandas and keeps only the rows whose
    'wellFormedAnswers' value is not the string '[]'. Those rows are written
    to NLGENREFERENCE as one JSON object per line, with the original 'answers'
    column dropped and 'wellFormedAnswers' renamed to 'answers'. The candidate
    file is then filtered down to the same query ids and written to
    TEMPNLGENCANDIDATE.

    Args:
        candidateFilename (str): path to the candidate file, one JSON object
            per line, each carrying a 'query_id' field.
    """
    evaluation = pd.read_json(EVALFILENAME)
    has_well_formed = evaluation['wellFormedAnswers'].map(
        lambda answers: answers != '[]').astype(bool)
    well_formed_ids = set(evaluation.loc[has_well_formed, 'query_id'])

    # Keyword form on purpose: the positional axis argument used previously
    # (`drop('answers', 1)`) is rejected by pandas >= 2.0.
    reference = evaluation.loc[has_well_formed].drop(columns='answers')
    reference = reference.rename(columns={'wellFormedAnswers': 'answers'})

    # Explicit UTF-8 keeps these files readable by load_file(), which always
    # opens its input as UTF-8 regardless of the platform default.
    with open(NLGENREFERENCE, 'w', encoding='utf-8') as reference_file:
        for _, row in reference.iterrows():
            reference_file.write(row.to_json() + '\n')

    with open(candidateFilename, 'r', encoding='utf-8') as candidate_file, \
            open(TEMPNLGENCANDIDATE, 'w', encoding='utf-8') as filtered_file:
        for line in candidate_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            if json.loads(line)['query_id'] in well_formed_ids:
                filtered_file.write(line)
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Each input string is tokenized with a spaCy English pipeline; every token
    is stripped and lower-cased, and the tokens are re-joined with single
    spaces.

    Args:
        p_iter (iter): iter over strings to normalize and tokenize.
        p_batch_size (int): number of strings handed to spaCy per batch.
        p_thread_count (int): kept for backward compatibility only; it is no
            longer forwarded to spaCy (see the note in the body).

    Returns:
        iter: iter over normalized and tokenized string.
    """
    global NLP
    if NLP is None:
        # Build the pipeline once and reuse it across calls.
        NLP = NlpEnglish(parser=False)
    # `n_threads` is intentionally not passed: spaCy deprecated it in v2.1
    # (where it is ignored) and Language.pipe() in v3 no longer accepts it.
    for doc in NLP.pipe(p_iter, batch_size=p_batch_size):
        yield ' '.join(str(token).strip().lower() for token in doc)
def load_file(p_path_to_data):
    """Load data from json file.

    Args:
        p_path_to_data (str): path to file to load.
            File should hold one JSON object per line, in format:
                {QUERY_ID_JSON_ID: <a_query_id_int>,
                 ANSWERS_JSON_ID: [<list_of_answers_string>]}
            Blank lines are skipped.

    Returns:
        query_id_to_answers_map (dict):
            dictionary mapping from query_id (int) to answers (list of strings).
            A record whose answers contain 'No Answer Present.' is stored
            with the single answer ''. A record with an empty answers list
            adds nothing to the map.
        no_answer_query_ids (set): set of query ids of no-answer queries.

    Raises:
        Exception: if a line is not valid JSON.
        AssertionError: if a record lacks the query id or the answers field.
    """
    query_id_to_answers_map = {}
    no_answer_query_ids = set()
    with open(p_path_to_data, 'r', encoding='utf-8') as data_file:
        for line in data_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            try:
                json_object = json.loads(line)
            except json.JSONDecodeError as err:
                raise Exception('\"%s\" is not a valid json' % line) from err

            # Raised explicitly (rather than via `assert`) so the validation
            # still runs under `python -O`; the exception type is unchanged.
            for required_field in (QUERY_ID_JSON_ID, ANSWERS_JSON_ID):
                if required_field not in json_object:
                    raise AssertionError(
                        '\"%s\" json does not have \"%s\" field' %
                        (line, required_field))

            query_id = json_object[QUERY_ID_JSON_ID]
            answers = json_object[ANSWERS_JSON_ID]
            if 'No Answer Present.' in answers:
                no_answer_query_ids.add(query_id)
                answers = ['']
            if answers:
                # Answers are stored as-is; the normalize_batch() pass is
                # deliberately not applied here.
                query_id_to_answers_map.setdefault(query_id, []).extend(answers)
    return query_id_to_answers_map, no_answer_query_ids
def compute_metrics_from_files(p_path_to_reference_file,
                               p_path_to_candidate_file,
                               p_max_bleu_order):
    """Compute BLEU-N, ROUGE-L and answerability F1 metrics.

    IMPORTANT: No-answer reference will be excluded from the BLEU/ROUGE
    calculation. F1 is computed over which queries are treated as answerable
    by the reference versus the candidate.

    Args:
        p_path_to_reference_file (str): path to reference file.
        p_path_to_candidate_file (str): path to candidate file.
            Both files should be in format:
                {QUERY_ID_JSON_ID: <a_query_id_int>,
                 ANSWERS_JSON_ID: [<list_of_answers_string>]}
        p_max_bleu_order: the maximum n order in bleu_n calculation.

    Returns:
        dict: dictionary of {'bleu_n': <bleu_n score>, 'rouge_l': <rouge_l score>,
            'F1': <answerability F1>} with one 'bleu_n' entry per score
            returned by the BLEU scorer.

    Raises:
        AssertionError: if a scored candidate query has more than one answer,
            or if reference and candidate do not cover the same query ids.
    """
    reference_dictionary, reference_no_answer_query_ids = \
        load_file(p_path_to_reference_file)
    candidate_dictionary, candidate_no_answer_query_ids = \
        load_file(p_path_to_candidate_file)

    query_id_answerable = \
        set(reference_dictionary) - reference_no_answer_query_ids
    query_id_answerable_candidate = \
        set(candidate_dictionary) - candidate_no_answer_query_ids

    true_positives = len(query_id_answerable_candidate & query_id_answerable)
    false_negatives = len(query_id_answerable) - true_positives
    true_negatives = len(
        candidate_no_answer_query_ids & reference_no_answer_query_ids)
    false_positives = len(reference_no_answer_query_ids) - true_negatives

    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives
    precision = \
        float(true_positives) / predicted_positives if predicted_positives > 0 else 1.
    recall = \
        float(true_positives) / actual_positives if actual_positives > 0 else 1.
    # Precision and recall can both be 0 (no true positives at all); report
    # F1 as 0 in that case instead of dividing by zero.
    if precision + recall > 0:
        F1 = 2 * ((precision * recall) / (precision + recall))
    else:
        F1 = 0.

    # Both sides are filtered by the *reference* no-answer ids so that the
    # scorers see exactly the queries the reference can answer.
    filtered_reference_dictionary = {
        key: value for key, value in reference_dictionary.items()
        if key not in reference_no_answer_query_ids}
    filtered_candidate_dictionary = {
        key: value for key, value in candidate_dictionary.items()
        if key not in reference_no_answer_query_ids}

    # Validation raises AssertionError explicitly (rather than via `assert`)
    # so it still runs under `python -O`.
    for query_id, answers in filtered_candidate_dictionary.items():
        if len(answers) > 1:
            raise AssertionError(
                'query_id %d contains more than 1 answer \"%s\" in candidate file' %
                (query_id, str(answers)))

    reference_query_ids = set(filtered_reference_dictionary)
    candidate_query_ids = set(filtered_candidate_dictionary)
    common_query_ids = reference_query_ids & candidate_query_ids
    if len(common_query_ids) != len(reference_query_ids) or \
            len(common_query_ids) != len(candidate_query_ids):
        raise AssertionError(
            'Reference and candidate files must share same query ids')

    all_scores = {}
    bleu_scores, _ = Bleu(p_max_bleu_order).compute_score(
        filtered_reference_dictionary, filtered_candidate_dictionary)
    for i, bleu_score in enumerate(bleu_scores):
        all_scores['bleu_%d' % (i + 1)] = bleu_score

    rouge_score, _ = Rouge().compute_score(
        filtered_reference_dictionary, filtered_candidate_dictionary)
    all_scores['rouge_l'] = rouge_score
    all_scores['F1'] = F1
    return all_scores
def main():
    """Command line: /ms_marco_metrics$ PYTHONPATH=./bleu python QnA.py <path_to_candidate_file>

    Builds the NLGEN reference/candidate files from the candidate file, then
    prints the QnA metrics (candidate file against QNAREFERENCE) followed by
    the NLGEN metrics (TEMPNLGENCANDIDATE against NLGENREFERENCE).
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of a bare IndexError.
        sys.exit('Usage: python %s <path_to_candidate_file>' % sys.argv[0])
    path_to_candidate_file = sys.argv[1]
    make_well_formed(path_to_candidate_file)

    # (title, reference file, candidate file) for each evaluation, in the
    # order they are reported.
    evaluations = (
        ("QnA Metrics", QNAREFERENCE, path_to_candidate_file),
        ("NLGEN Metrics", NLGENREFERENCE, TEMPNLGENCANDIDATE),
    )
    for title, reference_file, candidate_file in evaluations:
        print(title)
        metrics = compute_metrics_from_files(
            reference_file, candidate_file, MAX_BLEU_ORDER)
        print('############################')
        for metric in sorted(metrics):
            print('%s: %s' % (metric, metrics[metric]))
        print('############################')


if __name__ == "__main__":
    main()