-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrealsumm.py
112 lines (84 loc) · 4.72 KB
/
realsumm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import pickle
import typing
import pandas
import evalbase
import os
from eval_utils import eval_and_write
def clean_text(s: str):
"""Clean up the text in doc or summ in RealSumm dataset
including, removing HTML tags, unescap HTML control sequences
"""
s = s.replace("<t>", "")
s = s.replace("</t>", "")
s = s.replace("\t", " ")
s = s.strip()
return s
def load_realsumm(pickfile: str):
"""
In [4]: sd = pickle.load(open('abs.pkl','rb'))
In [5]: sd.keys()
Out[5]: dict_keys([52, 2, 23, 62, 6, 92, 73, 47, 86, 32, 67, 42, 36, 3, 50, 35, 17, 45, 16, 48, 95, 91, 89, 85, 74, 28, 49, 58, 12, 33, 14, 9, 8, 29, 43, 39, 38, 84, 57, 31, 21, 71, 15, 24, 78, 5, 90, 87, 51, 40, 1, 46, 66, 56, 7, 88, 72, 77, 34, 68, 26, 64, 18, 76, 30, 80, 61, 99, 79, 41, 27, 94, 22, 4, 82, 0, 10, 37, 25, 60, 70, 13, 19, 75, 69, 54, 65, 81, 98, 11, 55, 44, 59, 20, 83, 96, 97, 93, 53, 63])
In [6]: ! ls -lht
total 4.2M
-rw-rw-r-- 1 forrest forrest 956K Sep 12 15:15 ext_ours.pkl
-rw-rw-r-- 1 forrest forrest 1.1M Sep 12 15:15 ext.pkl
-rw-rw-r-- 1 forrest forrest 1.1M Sep 12 15:15 abs_ours.pkl
-rw-rw-r-- 1 forrest forrest 1.2M Sep 12 15:15 abs.pkl
In [7]: sdp[52].keys()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 sdp[52].keys()
NameError: name 'sdp' is not defined
In [8]: sd[52].keys()
Out[8]: dict_keys(['doc_id', 'ref_summ', 'system_summaries'])
In [9]: sd[52]['doc_id']
Out[9]: 52
In [10]: sd[52]['ref_summ']
Out[10]: "<t> manchester united take on manchester city on sunday . </t> <t> match will begin at 4pm local time at united 's old trafford home . </t> <t> police have no objections to kick-off being so late in the afternoon . </t> <t> last late afternoon weekend kick-off in the manchester derby saw 34 fans arrested at wembley in 2011 fa cup semi-final . </t>"
In [11]: sd[52]['system_summaries'].keys()
Out[11]: dict_keys(['presumm_out_trans_abs.txt', 'two_stage_rl_out.txt', 'unilm_out_v2.txt', 't5_out_large.txt', 'presumm_out_ext_abs.txt', 'ptr_generator_out_pointer_gen_cov.txt', 'bart_out.txt', 'fast_abs_rl_out_rerank.txt', 't5_out_11B.txt', 'presumm_out_abs.txt', 'bottom_up_out.txt', 'unilm_out_v1.txt', 't5_out_base.txt', 'semsim_out.txt'])
In [12]:
"""
sd = pickle.load(open(pickfile, 'rb'))
# 1. Get documents
# code from https://github.com/forrestbao/SueNes/blob/newsroom/human/realsumm/generate_test.py
used_test_id = [1017, 10586, 11343, 1521, 2736, 3789, 5025, 5272, 5576, 6564, 7174, 7770, 8334, 9325, 9781, 10231,
10595, 11351, 1573, 2748, 3906, 5075, 5334, 5626, 6714, 7397, 7823, 8565, 9393, 9825, 10325, 10680,
11355, 1890, 307, 4043, 5099, 5357, 5635, 6731, 7535, 7910, 8613, 9502, 10368, 10721, 1153, 19,
3152, 4303, 5231, 5420, 5912, 6774, 7547, 8001, 8815, 9555, 10537, 10824, 1173, 1944, 3172, 4315,
5243, 5476, 6048, 6784, 7584, 8054, 8997, 9590, 10542, 11049, 1273, 2065, 3583, 4637, 5244, 5524,
6094, 6976, 7626, 8306, 9086, 9605, 10563, 11264, 1492, 2292, 3621, 4725, 5257, 5558, 6329, 7058,
7670, 8312, 9221, 9709]
cnndm_test_articles = []
# FIXME: This is cyclic import
with open(os.path.join(evalbase.path, "dataloader/src.txt"), "r", encoding="utf-8") as f:
cnndm_test_articles = list(f)
used_articles = [cnndm_test_articles[i] for i in used_test_id]
dataset_df = pandas.DataFrame([], )
for sample in sd.values():
doc_id = sample['doc_id']
doc = used_articles[doc_id]
ref_summ = sample['ref_summ']
for sys_name in sample['system_summaries']:
sys_summ = sample['system_summaries'][sys_name]['system_summary']
score_dict = sample['system_summaries'][sys_name]['scores']
row = {
"doc_id": [doc_id],
"ArticleText": [clean_text(doc)],
"ReferenceSummary": [clean_text(ref_summ)],
"system": [sys_name[:-4]],
"SystemSummary": [clean_text(sys_summ)],
}
for score_name, score in score_dict.items():
row[score_name] = [score]
# human rating is in row['litepyramid_recall']
tmp_dataset_df = pandas.DataFrame.from_dict(row)
dataset_df = pandas.concat([
dataset_df, tmp_dataset_df])
return dataset_df
def main(exp_config):
dataset_name = exp_config["dataset_name"]
dataset_df = load_realsumm(exp_config["data_path"])
eval_and_write(dataset_name, dataset_df, exp_config)