# convert_duie_relation_to_fastlabel.py
# @Time : 2022/4/10 20:07
# @Author : tk
import json
def get_label_from_entity(text,pos,entities):
    """Return the label under which ``text`` is recorded at position ``pos``.

    Args:
        text: Entity mention to look up.
        pos: Two-element sequence ``(start, end)`` to match.
        entities: Mapping ``label -> {mention: [position, ...]}``.

    Returns:
        The first label (in iteration order of ``entities``) that stores a
        position for ``text`` whose first two items equal ``pos[0]`` and
        ``pos[1]``; ``None`` when no label matches.
    """
    for label, spans_by_mention in entities.items():
        # Labels that never saw this mention contribute no candidates.
        for span in spans_by_mention.get(text, ()):
            if span[0] != pos[0] or span[1] != pos[1]:
                continue
            return label
    return None
def get_pos(text: str,k: str):
    """Locate the first occurrence of ``k`` inside ``text``.

    Args:
        text: String to search in.
        k: Non-empty substring to search for.

    Returns:
        ``[start, end]`` offsets of the first match, where ``end`` is
        inclusive (``start + len(k) - 1``).

    Raises:
        ValueError: If ``k`` is empty or does not occur in ``text``.
    """
    # Explicit raises instead of ``assert``: assertions are removed when Python
    # runs with -O, which would let an empty ``k`` silently yield [0, -1].
    if k == '':
        raise ValueError('find error k=', k)
    start = text.find(k)
    if start == -1:
        raise ValueError(k)
    return [start, start + len(k) - 1]
def _add_entity_span(entities, label, mention, pos):
    """Store ``pos`` for ``mention`` under ``label`` unless it is already there."""
    spans = entities.setdefault(label, {}).setdefault(mention, [])
    if pos not in spans:
        spans.append(pos)


def convert2fastlabel(in_file,out_file):
    """Convert a JSON-lines relation file into fastlabel JSON lines.

    Every non-blank input line is parsed as a JSON object providing ``text``
    and ``spo_list``. Each item of ``spo_list`` provides ``subject``,
    ``subject_type``, ``predicate`` and two dicts, ``object`` and
    ``object_type``, that share the same keys.

    For each input record one output line is written::

        {"id": <input line index>, "text": ...,
         "entities": {label: {mention: [[start, end], ...]}},
         "re_list": [{predicate: [subject_info, object_info]}, ...]}

    Positions come from ``get_pos``. A record whose triples cannot be processed
    (missing key, mention not found in the text, ...) is reported on stdout
    and skipped; it produces no output line.

    Args:
        in_file: Path of the UTF-8 input file.
        out_file: Path of the UTF-8 output file (overwritten).
    """
    with open(in_file,mode='r',encoding='utf-8') as f:
        # Read everything before opening the output so both paths may coincide.
        lines = f.readlines()

    with open(out_file,mode='w',encoding='utf-8',newline='\n') as f_out:
        for i,line in enumerate(lines):
            if not line.strip():
                # A blank line is not valid JSON; skip it instead of aborting
                # the whole conversion. Ids keep following the input line index.
                continue
            jd = json.loads(line)
            text = jd['text']
            entities = {}
            re_list = []
            spo_list = jd['spo_list']
            try:
                for spo in spo_list:
                    subject = spo['subject']
                    subject_type = spo['subject_type']
                    predicate = spo['predicate']
                    objects = spo['object']
                    object_types = spo['object_type']

                    s_pos = get_pos(text,subject)
                    if s_pos is None:
                        # Subject could not be located: nothing to anchor the
                        # relation to, so skip this triple.
                        continue
                    # The same subject usually appears in several triples of
                    # one sentence; record its span only once.
                    _add_entity_span(entities, subject_type, subject, s_pos)

                    for k, obj in objects.items():
                        obj_type = object_types[k]
                        o_pos = get_pos(text, obj)
                        if o_pos is None:
                            continue
                        # Keyed by the object mention (not its type), so spans
                        # collected for earlier triples are preserved.
                        _add_entity_span(entities, obj_type, obj, o_pos)
                        re_list.append({
                            predicate:[
                                {
                                    'entity': subject,
                                    'pos': s_pos,
                                    'label': subject_type
                                },
                                {
                                    'entity': obj,
                                    'pos': o_pos,
                                    'label': obj_type
                                }
                            ]
                        })

                f_out.write(json.dumps({
                    "id": i,
                    "text": text,
                    "entities": entities,
                    're_list': re_list
                }, ensure_ascii=False) + '\n')
            except Exception as e:
                # Deliberate best-effort: one malformed record must not stop
                # the conversion of the remaining ones.
                print(text,' error,',e)
                continue
def convert2labels(src,dst):
    """Extract the distinct (subject, predicate, object) label triples of a schema.

    Every non-blank line of ``src`` is parsed as a JSON object providing
    ``subject_type``, ``predicate`` and a dict ``object_type``; one triple is
    produced per value of ``object_type``. The distinct triples are printed
    together with their count, then written to ``dst`` as JSON lines of the
    form ``{"subject": ..., "predicate": ..., "object": ...}``.

    Triples are written in the order they first appear in ``src``, so the
    output is identical from run to run.

    Args:
        src: Path of the UTF-8 schema file (one JSON object per line).
        dst: Path of the UTF-8 output file (overwritten).
    """
    with open(src, mode='r', encoding='utf-8') as f:
        lines = f.readlines()

    seen = set()
    # Iterating a set of string tuples follows hash order, which changes
    # between interpreter runs; keep a list to preserve first-seen order.
    labels = []
    for line in lines:
        if not line.strip():
            # Blank lines are not valid JSON documents.
            continue
        jd = json.loads(line)
        if not jd:
            continue
        for object_type in jd['object_type'].values():
            triple = (jd['subject_type'], jd['predicate'], object_type)
            if triple not in seen:
                seen.add(triple)
                labels.append(triple)

    print(labels)
    print(len(labels))

    with open(dst, mode='w', encoding='utf-8', newline='\n') as f_out:
        for subject_type, predicate, object_type in labels:
            d = {"subject": subject_type, "predicate": predicate, "object": object_type}
            f_out.write(json.dumps(d,ensure_ascii=False) + '\n')
if __name__ == "__main__":
    # Conversion jobs, run in order: (converter, input path, output path).
    # The train and dev splits go through convert2fastlabel; the schema file
    # is reduced to its label triples by convert2labels.
    jobs = (
        (
            convert2fastlabel,
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\json\duie_train.json',
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\fastlabel_json\duie_train.json',
        ),
        (
            convert2fastlabel,
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\json\duie_dev.json',
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\fastlabel_json\duie_dev.json',
        ),
        (
            convert2labels,
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\json\duie_schema.json',
            r'F:\nlpdata_2022\比赛\百度关系\关系抽取\fastlabel_json\duie_schema.json',
        ),
    )
    for convert, in_file, out_file in jobs:
        convert(in_file, out_file)