import torch.nn as nn
from transformers import AdamW
from torch.utils.data import Dataset
import pandas as pd
import torch
from transformers import BertConfig, BertForSequenceClassification
from transformers import BertTokenizer
from torch.utils.data import DataLoader
from sklearn import metrics
# Hyperparameters
hidden_dropout_prob = 0.3
num_labels = 3
learning_rate = 1e-5
weight_decay = 1e-2
epochs = 3
batch_size = 16
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# data_path = "BertPractice/sentiment/" # linux
# data_path = "sentiment\\" # windows
# vocab_file = data_path + "vocab.txt" # 词汇表
# train_data = data_path + "sentiment.train.data" # 训练数据集
# valid_data = data_path + "sentiment.valid.data" # 验证数据集
train_data = "data/bert/train.csv"
valid_data = "data/bert/test.csv"
pretrained_model_name = "bert-base-chinese"
# Define the Dataset
class SentimentDataset(Dataset):
    def __init__(self, path_to_file):
        self.dataset = pd.read_csv(path_to_file, sep="\t", names=["text", "label"])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Look up the text and label for the given idx
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        sample = {"text": text, "label": label}
        # Return a dict
        return sample
# Load the training set
sentiment_train_set = SentimentDataset(train_data)
sentiment_train_loader = DataLoader(
    sentiment_train_set, batch_size=batch_size, shuffle=True, num_workers=0
)
# Load the validation set
sentiment_valid_set = SentimentDataset(valid_data)
sentiment_valid_loader = DataLoader(
    sentiment_valid_set, batch_size=batch_size, shuffle=False, num_workers=0
)
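# Quick sanity check of the data format. This is an assumption, not something
# guaranteed by the files themselves: SentimentDataset reads a headerless,
# tab-separated file with one "text<TAB>label" pair per line, where label is an
# integer class id in [0, num_labels). Uncomment to inspect the first sample:
# print(sentiment_train_set[0])  # e.g. {"text": "...", "label": 1}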
# Define the tokenizer (alternatively, build it from a vocabulary file)
# tokenizer = BertTokenizer(vocab_file)
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
# Load the model
config = BertConfig.from_pretrained(
    pretrained_model_name,
    num_labels=num_labels,
    hidden_dropout_prob=hidden_dropout_prob,
)
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name, config=config
)
model.to(device)
# Define the optimizer and loss function
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
# optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
criterion = nn.CrossEntropyLoss()
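# The comment above mentions a linear warmup/decay schedule, but no scheduler is
# actually created in this script. A minimal sketch of how one could be added
# with transformers.get_linear_schedule_with_warmup; the 10% warmup fraction is
# an assumption, not something specified here:
#
# from transformers import get_linear_schedule_with_warmup
# total_steps = len(sentiment_train_loader) * epochs
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=int(0.1 * total_steps),
#     num_training_steps=total_steps,
# )
# # scheduler.step() would then be called after each optimizer.step() in train().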
# Define the training function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for i, batch in enumerate(dataloader):
        # Labels: tensor of shape (batch_size,)
        label = batch["label"].to(device)
        text = batch["text"]
        # tokenized_text contains input_ids, token_type_ids, attention_mask
        tokenized_text = tokenizer(
            text,
            max_length=100,
            add_special_tokens=True,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        tokenized_text = tokenized_text.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # output: (loss), logits, (hidden_states), (attentions)
        output = model(**tokenized_text, labels=label)
        # y_pred_prob = logits : [batch_size, num_labels]
        y_pred_prob = output[1]
        y_pred_label = y_pred_prob.argmax(dim=1)
        # Compute the loss
        # (this loss is the same as output[0])
        loss = criterion(y_pred_prob, label.view(-1))
        # Count correct predictions in this batch
        acc = ((y_pred_label == label.view(-1)).sum()).item()
        # Backpropagation
        loss.backward()
        optimizer.step()
        # Accumulate loss and acc over the epoch:
        # loss is the mean loss of one batch
        epoch_loss += loss.item()
        # acc is the number of correct predictions in one batch
        epoch_acc += acc
        if i % 200 == 0:
            print(
                "current loss:",
                epoch_loss / (i + 1),
                "\t",
                "current acc:",
                epoch_acc / ((i + 1) * len(label)),
            )
    # len(dataloader) is the number of batches; len(dataloader.dataset.dataset) is the number of samples
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader.dataset.dataset)
def evaluate(model, iterator, device):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    f1_score = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            label = batch["label"].to(device)
            text = batch["text"]
            tokenized_text = tokenizer(
                text,
                max_length=100,
                add_special_tokens=True,
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            tokenized_text = tokenized_text.to(device)
            output = model(**tokenized_text, labels=label)
            y_pred_label = output[1].argmax(dim=1)
            loss = output[0]
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            # Accumulate loss and acc over the epoch:
            # loss is the mean loss of one batch
            epoch_loss += loss.item()
            # acc is the number of correct predictions in one batch
            epoch_acc += acc
            f1_score += metrics.f1_score(label.cpu(), y_pred_label.cpu(), average="micro")
    # len(iterator) is the number of batches; len(iterator.dataset.dataset) is the number of samples
    return (
        epoch_loss / len(iterator),
        epoch_acc / len(iterator.dataset.dataset),
        f1_score / len(iterator),
    )
# Run training and validation
for i in range(epochs):
    train_loss, train_acc = train(
        model, sentiment_train_loader, optimizer, criterion, device
    )
    print("train loss: ", train_loss, "\t", "train acc:", train_acc)
    valid_loss, valid_acc, valid_f1_score = evaluate(model, sentiment_valid_loader, device)
    print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc, "\t", "f1_score:", valid_f1_score)
    torch.save(model, f"module/bert/epoch{i}_{valid_loss}_{valid_acc}_{valid_f1_score}.full")
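# A minimal inference sketch for the checkpoints saved above. The checkpoint
# path and input sentence below are hypothetical (the actual filename depends on
# the epoch and validation metrics). Since torch.save(model, ...) pickles the
# whole module, torch.load returns a BertForSequenceClassification directly
# (recent PyTorch versions may additionally require weights_only=False):
#
# model = torch.load("module/bert/epoch2_<loss>_<acc>_<f1>.full", map_location=device)
# model.eval()
# with torch.no_grad():
#     encoded = tokenizer(
#         ["这家店的服务态度很好"],  # hypothetical input sentence
#         max_length=100,
#         truncation=True,
#         padding=True,
#         return_tensors="pt",
#     ).to(device)
#     logits = model(**encoded)[0]        # no labels passed, so index 0 is the logits
#     pred = logits.argmax(dim=1).item()  # predicted class id in [0, num_labels)
#     print(pred)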