# model.py
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# BERTSentenceTransform
class BERTSentenceTransform:
r"""BERT style data transformation.
Parameters
----------
tokenizer : BERTTokenizer.
Tokenizer for the sentences.
max_seq_length : int.
Maximum sequence length of the sentences.
pad : bool, default True
Whether to pad the sentences to maximum length.
pair : bool, default True
Whether to transform sentences or sentence pairs.
"""
    def __init__(self, tokenizer, max_seq_length, vocab, pad=True, pair=True):
self._tokenizer = tokenizer
self._max_seq_length = max_seq_length
self._pad = pad
self._pair = pair
self._vocab = vocab
def __call__(self, line):
"""Perform transformation for sequence pairs or single sequences.
The transformation is processed in the following steps:
- tokenize the input sequences
- insert [CLS], [SEP] as necessary
- generate type ids to indicate whether a token belongs to the first
sequence or the second sequence.
- generate valid length
For sequence pairs, the input is a tuple of 2 strings:
text_a, text_b.
Inputs:
text_a: 'is this jacksonville ?'
text_b: 'no it is not'
Tokenization:
text_a: 'is this jack ##son ##ville ?'
text_b: 'no it is not .'
Processed:
tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
valid_length: 14
For single sequences, the input is a tuple of single string:
text_a.
Inputs:
text_a: 'the dog is hairy .'
Tokenization:
text_a: 'the dog is hairy .'
Processed:
text_a: '[CLS] the dog is hairy . [SEP]'
type_ids: 0 0 0 0 0 0 0
valid_length: 7
Parameters
----------
line: tuple of str
Input strings. For sequence pairs, the input is a tuple of 2 strings:
(text_a, text_b). For single sequences, the input is a tuple of single
string: (text_a,).
Returns
-------
        np.array: input token ids in 'int32', shape (seq_length,)
        np.array: valid length (number of non-padding tokens) in 'int32', scalar
        np.array: input token type ids in 'int32', shape (seq_length,)
"""
# convert to unicode
text_a = line[0]
if self._pair:
assert len(line) == 2
text_b = line[1]
tokens_a = self._tokenizer.tokenize(text_a)
tokens_b = None
if self._pair:
            tokens_b = self._tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b,
self._max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > self._max_seq_length - 2:
tokens_a = tokens_a[0:(self._max_seq_length - 2)]
# The embedding vectors for `type=0` and `type=1` were learned during
# pre-training and are added to the wordpiece embedding vector
# (and position vector). This is not *strictly* necessary since
# the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
# For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
#vocab = self._tokenizer.vocab
vocab = self._vocab
tokens = []
tokens.append(vocab.cls_token)
tokens.extend(tokens_a)
tokens.append(vocab.sep_token)
segment_ids = [0] * len(tokens)
if tokens_b:
tokens.extend(tokens_b)
tokens.append(vocab.sep_token)
segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
# The valid length of sentences. Only real tokens are attended to.
valid_length = len(input_ids)
if self._pad:
# Zero-pad up to the sequence length.
padding_length = self._max_seq_length - valid_length
# use padding tokens for the rest
input_ids.extend([vocab[vocab.padding_token]] * padding_length)
segment_ids.extend([0] * padding_length)
return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
np.array(segment_ids, dtype='int32')
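
    # Note: _truncate_seq_pair is called above (pair=True path) but is not defined in this
    # file. A minimal sketch of the standard BERT truncation helper so that path works;
    # in this script the transform is only used with pair=False, so it is never reached.
    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate the longer of the two token lists until the pair fits in max_length."""
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()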
# Hyperparameter settings
max_len = 100
batch_size = 16 # 64
warmup_ratio = 0.1
num_epochs = 2  # number of training epochs (adjust here)
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5
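
# The hyperparameters above (and the AdamW / get_cosine_schedule_with_warmup imports)
# suggest a fine-tuning loop that lives outside this file. A minimal sketch of how they
# are typically wired together; `build_optimizer_and_scheduler` and its `train_dataloader`
# argument are assumptions for illustration, not part of this script.
def build_optimizer_and_scheduler(model, train_dataloader):
    # Standard BERT practice: no weight decay on bias and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)
    # In the training loop itself, max_grad_norm is typically applied via
    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm).
    return optimizer, scheduler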
# BERTDataset
class BERTDataset(Dataset):
def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
pad, pair):
transform = BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
self.sentences = [transform([i[sent_idx]]) for i in dataset]
self.labels = [np.int32(i[label_idx]) for i in dataset]
def __getitem__(self, i):
return (self.sentences[i] + (self.labels[i], ))
def __len__(self):
return (len(self.labels))
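
# Expected input format for BERTDataset (inferred from the indexing above and from
# predict() below): `dataset` is an iterable of rows where row[sent_idx] is the raw
# sentence string and row[label_idx] is the class label, e.g. [["문장", "0"], ...].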
# get_kobert_model (from the KoBERT GitHub repo)
def get_kobert_model(model_path, vocab_file, ctx="cpu"):
bertmodel = BertModel.from_pretrained(model_path)
device = torch.device(ctx)
bertmodel.to(device)
bertmodel.eval()
vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
padding_token='[PAD]')
return bertmodel, vocab_b_obj
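
# Note: get_kobert_model is kept from the KoBERT repo but is not called below; the model,
# tokenizer, and vocab are instead loaded directly via KoBERTTokenizer / BertModel.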
# BERTClassifier
class BERTClassifier(nn.Module):
def __init__(self,
bert,
hidden_size = 768,
                 num_classes = 5,  # number of output classes (adjust here)
dr_rate = None,
params = None):
super(BERTClassifier, self).__init__()
self.bert = bert
self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
if dr_rate:
self.dropout = nn.Dropout(p = dr_rate)
def gen_attention_mask(self, token_ids, valid_length):
attention_mask = torch.zeros_like(token_ids)
for i, v in enumerate(valid_length):
attention_mask[i][:v] = 1
return attention_mask.float()
    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device),
                              return_dict=False)
        # Apply dropout only when a rate was given; otherwise pass the pooled output through.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
model = BERTClassifier(bertmodel, dr_rate = 0.5)
model.load_state_dict(torch.load('C:\\KGU\\model\\test\\3250_made_test_state.pt'), strict=False)
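
# If the checkpoint was saved on GPU and this script runs on a CPU-only machine, the
# torch.load call above may need map_location, e.g. (same path, shown as an alternative):
# model.load_state_dict(torch.load('C:\\KGU\\model\\test\\3250_made_test_state.pt',
#                                  map_location=torch.device('cpu')), strict=False)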
def predict(predict_sentence):
    # Wrap the sentence with a dummy label so it matches the row format BERTDataset expects.
    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tokenizer, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)

    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long()
        segment_ids = segment_ids.long()
        label = label.long()

        out = model(token_ids, valid_length, segment_ids)

        test_eval = []
        for i in out:
            logits = i.detach().cpu().numpy()
            pred = np.argmax(logits)
            if pred == 0:
                test_eval.append("건강")    # health
            elif pred == 1:
                test_eval.append("돈관리")  # money management
            elif pred == 2:
                test_eval.append("공부")    # study
            elif pred == 3:
                test_eval.append("취미")    # hobby
            elif pred == 4:
                test_eval.append("일상")    # daily life

        print(">>>>> Predicted category: " + test_eval[0])
        return test_eval[0]
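
# Minimal usage sketch; the input sentence below is a hypothetical example, and predict()
# returns one of the five Korean category labels defined above.
if __name__ == "__main__":
    example_sentence = "오늘은 헬스장에서 운동을 했다"  # hypothetical: "I worked out at the gym today"
    predicted_category = predict(example_sentence)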