-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
96 lines (81 loc) · 2.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import torchtext
def get_data(path):
    """Read a column-formatted tagging file into ``[words, tags]`` pairs.

    Every non-blank line is split on whitespace; the first column is taken
    as the token and the last column as its tag. Blank (whitespace-only)
    lines separate sentences.

    The final sentence is kept even when the file does not end with a blank
    line, and leading or repeated blank lines do not produce empty sentences.

    :param path: path to a UTF-8 encoded text file.
    :return: list of ``[words, tags]`` where ``words`` and ``tags`` are
        parallel lists of strings, one pair per sentence, in file order.
    """
    train_data = []
    words = []
    tags = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            columns = line.split()
            if columns:
                words.append(columns[0])
                tags.append(columns[-1])
            elif words:
                # Sentence boundary: only flush when something was collected,
                # so consecutive blank lines do not add empty sentences.
                train_data.append([words, tags])
                words = []
                tags = []
    # Flush the last sentence when the file has no trailing blank line.
    if words:
        train_data.append([words, tags])
    return train_data
def read_file(path, data_fields):
    """Build a torchtext ``Dataset`` from a column-formatted tagging file.

    Every non-blank line is split on whitespace; the first column is taken
    as the token and the last column as its tag. Blank (whitespace-only)
    lines separate sentences, and each sentence becomes one
    ``torchtext.data.Example`` created from ``[words, tags]``.

    The final sentence is kept even when the file does not end with a blank
    line, and leading or repeated blank lines do not produce empty examples.

    NOTE(review): ``torchtext.data.Example`` / ``torchtext.data.Dataset`` are
    only available under this path in older torchtext releases -- confirm the
    pinned version.

    :param path: path to a UTF-8 encoded text file.
    :param data_fields: field specification passed unchanged to
        ``Example.fromlist`` and ``Dataset``.
    :return: ``torchtext.data.Dataset`` holding one example per sentence.
    """
    examples = []
    words = []
    tags = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            columns = line.split()
            if columns:
                words.append(columns[0])
                tags.append(columns[-1])
            elif words:
                # Sentence boundary: skip the flush when nothing was
                # collected, so empty examples are never created.
                examples.append(
                    torchtext.data.Example.fromlist([words, tags], data_fields)
                )
                words = []
                tags = []
    # Flush the last sentence when the file has no trailing blank line.
    if words:
        examples.append(
            torchtext.data.Example.fromlist([words, tags], data_fields)
        )
    return torchtext.data.Dataset(examples, data_fields)
def normalize_word(word):
    """Return *word* with every digit character replaced by ``'0'``.

    For example ``covid-19`` becomes ``covid-00`` and ``20-11-2001`` becomes
    ``00-00-0000``. A character counts as a digit when ``str.isdigit()`` is
    true for it; all other characters are copied unchanged.

    :param word: the token to normalize.
    :return: a new string of the same length as ``word``.
    """
    # A single join avoids repeated string concatenation in a loop.
    return "".join('0' if char.isdigit() else char for char in word)
def get_sent_by_tag(tag, path):
    """Collect the sentences of a tagging file that contain a given tag.

    Every non-blank line is split on whitespace; the first column is taken
    as the token and the last column as its tag. Blank (whitespace-only)
    lines separate sentences. A sentence is kept when ``tag`` is equal to at
    least one of its tag labels (a full-label comparison, so pass e.g.
    ``'B-JOB'`` rather than ``'JOB'``).

    The final sentence is also checked when the file does not end with a
    blank line.

    :param tag: the exact tag label to look for.
    :param path: path to a UTF-8 encoded text file.
    :return: list of ``[words, tags]`` for the matching sentences, in file
        order.
    """
    examples = []
    words = []
    tags = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            columns = line.split()
            if columns:
                words.append(columns[0])
                tags.append(columns[-1])
                continue
            # Sentence boundary: keep the sentence only if it has the tag.
            if tag in tags:
                examples.append([words, tags])
            words = []
            tags = []
    # Check the last sentence when the file has no trailing blank line.
    if tag in tags:
        examples.append([words, tags])
    return examples
def get_instances_by_tag(dataset, tag):
    """Extract the distinct entity mentions of one type from a tagged dataset.

    A mention starts at a token labelled ``B-<tag>`` and extends over the
    immediately following tokens labelled ``I-<tag>``. Each distinct mention
    is returned once, in order of first appearance.

    :param dataset: iterable of examples where ``ex[0]`` is the token
        sequence and ``ex[1]`` the parallel tag sequence. Tokens are assumed
        to be hashable (e.g. strings), since they are used for de-duplication.
    :param tag: entity type without the ``B-``/``I-`` prefix, for example
        ``JOB`` or ``ORGANIZATION``.
    :return: list of mentions, each a list of tokens, e.g. for ``JOB``:
        ``[["teacher"], ["factory", "worker"], ...]``.
    """
    begin_label = f'B-{tag}'
    inside_label = f'I-{tag}'
    instances = []
    seen = set()  # tuple form of every mention already emitted
    for ex in dataset:
        words, tags = ex[0], ex[1]
        for start, label in enumerate(tags):
            if label != begin_label:
                continue
            # Advance past the run of I-<tag> labels; the bounds check also
            # covers a B-<tag> that is the last token of the sentence.
            end = start + 1
            while end < len(tags) and tags[end] == inside_label:
                end += 1
            instance = list(words[start:end])
            key = tuple(instance)
            if key not in seen:
                seen.add(key)
                instances.append(instance)
    return instances