-
Notifications
You must be signed in to change notification settings - Fork 29
/
add_drop_entity.py
120 lines (97 loc) · 4.25 KB
/
add_drop_entity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import pandas as pd
from utils import concat, data_path, train_path, load_data
def get_entity_data(data):
    """Expand rows flagged ``negative == 1`` into one row per candidate entity.

    Args:
        data: DataFrame with at least the columns ``id``, ``title``, ``text``,
            ``entity`` and ``negative``. ``entity`` holds ';'-separated
            entity names. A ``key_entity`` column (same ';'-separated
            format) is optional.

    Returns:
        A new DataFrame with columns ``id``, ``text``, ``entity`` and
        ``label``. ``text`` is ``concat(title, text)`` of the source row
        (``concat`` comes from ``utils``; its exact output is not visible
        here). ``label`` is 1 when the entity is listed in the row's
        ``key_entity`` value and 0 otherwise. Empty entity names are skipped.
        A missing ``key_entity`` column or a non-string value (e.g. NaN)
        is treated as "no key entities".
    """
    # Copy the filtered rows so adding the "text" column never writes into a
    # slice of the caller's frame (avoids SettingWithCopy problems).
    data = data[data["negative"] == 1].copy()
    # Built with a list comprehension rather than DataFrame.apply(axis=1):
    # apply on an empty frame returns a frame, which cannot be assigned to a
    # single column.
    data["text"] = [
        concat(title, body) for title, body in zip(data["title"], data["text"])
    ]

    if "key_entity" in data.columns:
        key_column = data["key_entity"]
    else:
        key_column = [None] * len(data)

    ids = []
    texts = []
    entities = []
    labels = []
    rows = zip(data["id"], data["text"], data["entity"], key_column)
    for row_id, row_text, raw_entities, raw_keys in rows:
        # Non-string key_entity values (NaN and the like) mean "no key entity".
        key_entities = set(raw_keys.split(';')) if isinstance(raw_keys, str) else set()
        for name in raw_entities.split(';'):
            # Compare by value; identity comparison with a literal is not
            # guaranteed to work across interpreters.
            if name == "":
                continue
            ids.append(row_id)
            texts.append(row_text)
            entities.append(name)
            labels.append(1 if name in key_entities else 0)

    return pd.DataFrame({"id": ids, "text": texts, "entity": entities, "label": labels})
def not_in_text(x):
    """Return 1 when ``x["entity"]`` does not occur in ``x["text"]``, else 0.

    ``x`` is any mapping-like row (e.g. a DataFrame row) providing the keys
    ``entity`` and ``text``; the check is a plain ``in`` containment test.
    """
    is_missing = x["entity"] not in x["text"]
    return int(is_missing)
def add_entity(x, negative_entity_list):
    """Return the label for row ``x``, forcing it to 1 for listed entities.

    The label becomes 1 only when ``x["flag"]`` equals 1 and ``x["entity"]``
    is contained in ``negative_entity_list``; in every other case the
    existing ``x["label"]`` is returned unchanged.
    """
    force_positive = x["flag"] == 1 and x["entity"] in negative_entity_list
    if force_positive:
        return 1
    return x["label"]
def drop_entity(x, positive_entity_list):
    """Return the label for row ``x``, forcing it to 0 for listed entities.

    The label becomes 0 only when ``x["flag"]`` equals 1 and ``x["entity"]``
    is contained in ``positive_entity_list``; in every other case the
    existing ``x["label"]`` is returned unchanged.
    """
    force_zero = x["flag"] == 1 and x["entity"] in positive_entity_list
    if force_zero:
        return 0
    return x["label"]
def to_submit_format(data, test_data, file=""):
    """Collapse per-entity labels into one ';'-joined ``key_entity`` per id.

    Args:
        data: DataFrame with columns ``id``, ``entity`` and ``label``; one
            row per (id, entity) candidate.
        test_data: DataFrame with columns ``id`` and ``negative``. Every one
            of its rows is kept (left merge on ``id``).
        file: Optional file name. When non-empty, the result is written as
            CSV to ``os.path.join(data_path, "submit", file)``.

    Returns:
        The merged DataFrame with columns ``id``, ``negative`` and
        ``key_entity``. ``key_entity`` is the ';'-joined list of distinct
        entities whose label equals 1, in order of first appearance; it is
        ``""`` for ids present in ``data`` without any such entity and NaN
        for ids absent from ``data``.
    """
    # id -> insertion-ordered "set" of entities labelled 1. A dict is used
    # instead of a set so the joined string is deterministic across runs
    # (set iteration order of strings depends on hash randomisation).
    key_entities = {}
    for row_id, entity, label in zip(data["id"], data["entity"], data["label"]):
        kept = key_entities.setdefault(row_id, {})
        if label == 1:
            kept[entity] = None

    submit = pd.DataFrame({
        "id": list(key_entities),
        "key_entity": [';'.join(kept) for kept in key_entities.values()],
    })
    submit = pd.merge(test_data[["id", "negative"]], submit[["id", "key_entity"]], on="id", how="left")
    if file:
        submit[["id", "negative", "key_entity"]].to_csv(
            os.path.join(data_path, "submit", file), encoding='utf-8', index=False
        )
    return submit
if __name__ == "__main__":
    # Load both splits and attach the earlier submission's columns to the
    # test rows (joined on "id").
    train_data, test_data = load_data()
    submit_dir = os.path.join(data_path, "submit")
    submit_data = pd.read_csv(os.path.join(submit_dir, "fuxian_result.csv"), encoding='utf-8')
    test_data = pd.merge(test_data, submit_data, on="id")

    # One row per (id, entity) candidate for each split.
    train_data = get_entity_data(train_data)
    test_data = get_entity_data(test_data)

    # flag == 1 marks entities that do not literally occur in their text.
    # The flagged training rows are dumped to CSV and read back below.
    flagged_train_csv = os.path.join(data_path, "entity_train_not_in_text.csv")
    train_data["flag"] = train_data.apply(not_in_text, axis=1)
    flagged_rows = train_data[train_data["flag"] == 1]
    flagged_rows.to_csv(flagged_train_csv, encoding='utf-8-sig', index=False)
    test_data["flag"] = test_data.apply(not_in_text, axis=1)

    # Per-entity statistics over the flagged training rows only:
    #   entity_counter   - number of occurrences
    #   positive_counter - occurrences with label == 0
    #   negative_counter - occurrences with any other label
    train_data = pd.read_csv(flagged_train_csv, encoding='utf-8-sig')
    entity_counter = {}
    negative_counter = {}
    positive_counter = {}
    for name, label in zip(train_data["entity"], train_data["label"]):
        entity_counter[name] = entity_counter.get(name, 0) + 1
        bucket = positive_counter if label == 0 else negative_counter
        bucket[name] = bucket.get(name, 0) + 1

    # Turn the raw counts into fractions of each entity's occurrences.
    negative_counter = {
        name: count / entity_counter[name] for name, count in negative_counter.items()
    }
    positive_counter = {
        name: count / entity_counter[name] for name, count in positive_counter.items()
    }

    # Keep only entities that fell into a single bucket every time.
    negative_entity_list = [name for name, ratio in negative_counter.items() if ratio == 1.0]
    positive_entity_list = [name for name, ratio in positive_counter.items() if ratio == 1.0]

    # Override test labels for flagged entities, then write the submission.
    test_data["label"] = test_data.apply(add_entity, axis=1, args=(negative_entity_list,))
    test_data["label"] = test_data.apply(drop_entity, axis=1, args=(positive_entity_list,))
    to_submit_format(test_data, submit_data, "fuxian_add_drop.csv")