run_unsup_simcse.py
(forked from zhoujx4/NLP-Series-sentence-embeddings)
import logging
import math
from datetime import datetime
import torch
from sentence_transformers import InputExample, SentenceTransformer, LoggingHandler
from sentence_transformers import models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from torch.utils.data import DataLoader
from data.dataset import load_STS_data
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
# Training parameters
model_name = '/data/junxian/PTMs/chinese-macbert-base'
train_batch_size = 64
num_epochs = 2
max_seq_length = 64
# Prefer GPU 1 when CUDA is available, otherwise fall back to CPU
device = "cuda:1" if torch.cuda.is_available() else "cpu"
# Model save path
model_save_path = '/data/junxian/NLP-Series-sentence-embeddings/output/stsb_simcse-{}-{}-{}'.format(
    "macbert", train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Build the model: transformer encoder + [CLS] pooling
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode="cls",
                               pooling_mode_cls_token=True)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)
model[0].auto_model.attention_probs_dropout_prob = 0.1  # dropout acts as SimCSE's augmentation noise
model[0].auto_model.hidden_dropout_prob = 0.1
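# Note: 0.1 is already the default dropout for BERT-style models such as MacBERT,
# and HuggingFace dropout layers read their rate from the config at construction
# time, so the two assignments above mainly document the setting. In unsupervised
# SimCSE this standard dropout is the augmentation: encoding the same sentence
# twice with different random masks produces the two views of a positive pair.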
# Prepare the training set
sts_vocab = load_STS_data("/data/junxian/STS-B/cnsd-sts-train.txt")
all_vocab = [x[0] for x in sts_vocab] + [x[1] for x in sts_vocab]
print("The size of the unsupervised SimCSE training data is {}".format(len(all_vocab)))
train_samples = []
for data in all_vocab:
    train_samples.append(InputExample(texts=[data, data]))
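# Each sentence is paired with itself: during training the pair is encoded in two
# forward passes with different dropout masks, which yields the two "views" that
# unsupervised SimCSE pulls together.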
# Prepare the dev and test sets (STS-B scores range 0-5, normalized to [0, 1])
dev_data = load_STS_data("/data/junxian/STS-B/cnsd-sts-dev.txt")
test_data = load_STS_data("/data/junxian/STS-B/cnsd-sts-test.txt")
dev_samples = []
test_samples = []
for data in dev_data:
    dev_samples.append(InputExample(texts=[data[0], data[1]], label=data[2] / 5.0))
for data in test_data:
    test_samples.append(InputExample(texts=[data[0], data[1]], label=data[2] / 5.0))
# Initialize the evaluators
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size,
                                                                 name='sts-dev',
                                                                 main_similarity=SimilarityFunction.COSINE)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size,
                                                                  name='sts-test',
                                                                  main_similarity=SimilarityFunction.COSINE)
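# EmbeddingSimilarityEvaluator reports Pearson/Spearman correlations between the
# gold STS scores and the embedding similarities; main_similarity=COSINE makes
# the cosine-based correlation the headline score.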
# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)
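# With each InputExample holding the same text twice, MultipleNegativesRankingLoss
# reduces to the unsupervised SimCSE objective: the anchor's dropout-perturbed copy
# is its positive, and all other embeddings in the batch act as in-batch negatives
# (an InfoNCE-style contrastive loss over cosine similarities).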
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
evaluation_steps = int(len(train_dataloader) * 0.1) # Evaluate every 10% of the data
logging.info("Training sentences: {}".format(len(train_samples)))
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("Performance before training")
dev_evaluator(model)
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup_steps,
          show_progress_bar=False,
          output_path=model_save_path,
          optimizer_params={'lr': 2e-5},
          use_amp=False  # Set to True if your GPU supports FP16
          )
# Performance on the test set
model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
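# A minimal usage sketch (hypothetical sentences, not part of the repo):
# from sentence_transformers import util
# trained = SentenceTransformer(model_save_path)
# emb = trained.encode(["今天天气真好", "今天天气不错"])
# print(util.pytorch_cos_sim(emb[0], emb[1]))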