forked from PaddlePaddle/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
172 lines (152 loc) · 5.72 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import time
import utils
import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
parser.add_argument(
'--is_local', type=int, default=1, help='whether is local')
parser.add_argument(
'--hid_size', type=int, default=100, help='hidden-dim size')
parser.add_argument(
'--model_dir', type=str, default='model_recall20', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='number of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--parallel', type=int, default=0, help='whether parallel')
parser.add_argument(
'--base_lr', type=float, default=0.01, help='learning rate')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--step_num', type=int, default=1000, help='Number of steps')
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
if args.enable_ce:
fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED
hid_size = args.hid_size
train_dir = args.train_dir
vocab_path = args.vocab_path
use_cuda = True if args.use_cuda else False
parallel = True if args.parallel else False
print("use_cuda:", use_cuda, "parallel:", parallel)
batch_size = args.batch_size
vocab_size, train_reader = utils.prepare_data(
train_dir, vocab_path, batch_size=batch_size * get_cards(args),\
buffer_size=1000, word_freq_threshold=0, is_train=True)
# Train program
src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
vocab_size=vocab_size, hid_size=hid_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=avg_cost.name)
else:
train_exe = exe
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
ce_info = []
total_time = 0.0
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
place)
lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
place)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
ce_info.append(newest_ppl)
if i % args.print_batch == 0:
print("step:%d ppl:%.3f" % (i, newest_ppl))
if args.enable_ce and i > args.step_num:
break
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost, acc]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
# only for ce
if args.enable_ce:
ce_ppl = 0
try:
ce_ppl = ce_info[-2]
except:
print("ce info error")
epoch_idx = args.pass_num
device = get_device(args)
if args.use_cuda:
gpu_num = device[1]
print("kpis\teach_pass_duration_gpu%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_ppl_gpu%s\t%s" %
(gpu_num, ce_ppl))
else:
cpu_num = device[1]
threads_num = device[2]
print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, total_time / epoch_idx))
print("kpis\ttrain_ppl_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, ce_ppl))
print("finish training")
def get_device(args):
if args.use_cuda:
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", 1)
gpu_num = len(gpus.split(','))
return "gpu", gpu_num
else:
threads_num = os.environ.get('NUM_THREADS', 1)
cpu_num = os.environ.get('CPU_NUM', 1)
return "cpu", int(cpu_num), int(threads_num)
if __name__ == "__main__":
train()