-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_sub2.py
275 lines (242 loc) · 10 KB
/
make_sub2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import json
import requests
from tqdm import tqdm
import jieba
import jsonlines
# http://shuyantech.com/api/cndbpedia/ment2ent?q= #! 根据单词返回实体列表
# http://shuyantech.com/api/cndbpedia/avpair?q= #! 输入实体名返回三元知识(word)
# http://shuyantech.com/api/cnprobase/ment2ent?q= #! 输入单词返回实体列表
# http://shuyantech.com/api/cnprobase/concept?q= #! 输入实体返回实体概念列表
# http://shuyantech.com/api/entitylinking/cutsegment?q= #!分词筛选出现的概念列表
# 输入单词或短语,返回实体列表列表(取代http://shuyantech.com/api/entitylinking/cutsegment?q=的更精确功能)
def word2entityL(w):
url = "http://shuyantech.com/api/cndbpedia/ment2ent?q=" + \
w + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
e_list = r.json()["ret"]
if len(e_list):
return e_list
return []
def entity2desc(e):
# ensure the entity can be find in KG
url = "http://shuyantech.com/api/cndbpedia/avpair?q=" + \
e + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
desc = ""
cate = []
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
rw_list = r.json()["ret"]
for rw in rw_list:
if rw[0] == "DESC":
desc = rw[1]
# 没有的话.需要统计比例,并用另一种内容替代
if rw[0] == "CATEGORY_ZH":
cate.append(rw[1])
if len(desc) == 0:
desc = "无"
return desc, cate
# filter 字词,语言,名词
stopConcept = ["字词", "词语"]
def get_hint(e):
url = "http://shuyantech.com/api/cnprobase/concept?q=" + \
e + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
e_list = r.json()["ret"]
e_return = []
if len(e_list):
for x in e_list:
if x[0] not in stopConcept:
e_return.append(x[0])
return e_return
return []
def get_hint_complex(ce) :
hl = get_hint(ce)
if len(hl) > 0:
return hl
else:
return ["无"]
hint_dict = {"成语": ["熟语"], "植物": ["植物"]}
n = 2000
with open("finaltest.json", "r", encoding= 'utf-8') as f1,\
open("test_desc1420-2000.json", "w",encoding= 'utf-8') as f2,\
open("tryinfo1420-2000.txt", "w", encoding='utf-8') as f3:
for i in tqdm(range(n)):
line = f1.readline()
if i < 1420:
continue
d_line = json.loads(line) # json按照dict形式存储
# 对于choice,需要对于每一个list获得entity
choices = d_line["choice"] # choice列表
# label = d_line["label"] # label
hint = d_line["hint"]
entity_final = [] # choice对应的entity列表
desc_final = []
hc_final = []
in_hint = []
# get hint concept,会有一个或者多个,但是如果hint直接能够搜索到concept,视为hint已经被选择出
flag_hint = -1 # 选定hint的角标
hint_concepts = [] # 最终根据hint_flag 选择元素
hint_concept = get_hint(hint)
# 已经可以确定
if len(hint_concept) > 0:
hint_concepts.append(hint_concept)
flag_hint = 0
# 不可以确定,需要分词后检索
else:
f = 0
tmp = []
hw_l = jieba.lcut_for_search(hint)
he_l = []
for hw in hw_l:
# 如果所有都加入的话,那么在check的时候需要选择出最合适的hint,方法是什么呢?[如果得分根据某两个最高,那么直接认为是这样的][如果没有利用到这一条规则的choice,则选择label]
# 一维列表
he_l += word2entityL(hw)
for he in he_l:
if len(get_hint(he)) > 0:
tmp.append(get_hint(he))
f = 1
if f == 0:
hint_concepts.append(["无"])
else:
hint_concepts = tmp
if hint in hint_dict:
hint_concepts = [hint_dict[hint]]
flag_hint = 0
entityconcept_final = []
hw_l = jieba.lcut_for_search(hint)
hw_l.append(hint)
for c_idx, c in enumerate(choices):
# get entities for each choice
flag_entity = 0 #! 选择哪一个实体作为结果
flag_in = -1 #! 判断是否属于hint
c_entities = word2entityL(c)
if len(c_entities) > 0:
# c_entities = d_line["entities"][c_idx]
pass
else:
# cut word for null entites try to get extra entities
cw_l = jieba.lcut_for_search(c)
for cw in cw_l:
c_entities += word2entityL(cw)
if len(c_entities) == 0:
c_entities += ["无"] # 专门处理,可以预计到后面的结果了,设置为"无"
# 获得实体的DESC
c_descs = []
c_category = []
for ce in c_entities:
desc, category = entity2desc(ce)
c_descs.append(desc)
c_category.append(category)
# 获得实体的concept,即使为空,也进行保留,因为实体是可以查询的最小单元,不应再次分词
c_concepts = [get_hint_complex(ce) for ce in c_entities] # 是二维数组
# lv1
if flag_in == -1:
for ce_idx, ce in enumerate(c_entities):
if len(set(hw_l).intersection(set(c_concepts[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
elif len(set(hw_l).intersection(set(c_category[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
break
# lv2
if flag_in == -1:
for ce_idx, ce in enumerate(c_entities):
fl = 0
for hw in hw_l:
if hw in c_descs[ce_idx]:
flag_in = 1
flag_entity = ce_idx
fl = 1
break
if fl == 1:
break
# lv3 desc
if flag_in == -1:
for ce_idx, ce in enumerate(c_entities):
fl = 0
if flag_hint == -1:
for hc_idx,hint_concept in enumerate(hint_concepts):
for hc in hint_concept:
if hc in c_descs[ce_idx]:
flag_in = 1
flag_entity = ce_idx
flag_hint = hc_idx
fl = 1
break
if fl == 1:
break
if fl == 1:
break
else:
for hc_idx,hint_concept in enumerate(hint_concepts):
for hc in hint_concept:
if hc in c_descs[ce_idx]:
flag_in = 1
flag_entity = ce_idx
flag_hint = hc_idx
fl = 1
break
if fl == 1:
break
if fl == 1:
break
# lv4
if flag_in == -1:
for ce_idx, ce in enumerate(c_entities):
fl = 0
if flag_hint == -1:
for hc_idx,hint_concept in enumerate(hint_concepts):
if len(set(hint_concept).intersection(set(c_concepts[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
flag_hint = hc_idx
fl = 1
break
elif len(set(hint_concept).intersection(set(c_category[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
flag_hint = hc_idx
fl = 1
break
else:
if len(set(hint_concepts[flag_hint]).intersection(set(c_concepts[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
fl = 1
break
elif len(set(hint_concepts[flag_hint]).intersection(set(c_category[ce_idx]))) > 0:
flag_in = 1
flag_entity = ce_idx
fl = 1
break
if fl == 1:
break
if flag_in == -1:
flag_in = 0
info = "hint: " + hint + "choice: " + c + '\n'
f3.write(info)
entity_final.append(c_entities[flag_entity])
entityconcept_final.append(c_concepts[flag_entity])
desc_final.append(c_descs[flag_entity])
in_hint.append(flag_in)
if flag_hint == -1:
flag_hint = 0
d_line["in_hint"] = in_hint
d_line["entities"] = entity_final
d_line["hint_concept"] = hint_concepts[flag_hint]
d_line["entity_concept"] = entityconcept_final
d_line["desc"] = desc_final
json.dump(d_line, f2, ensure_ascii=False)
f2.write('\n')