-
Notifications
You must be signed in to change notification settings - Fork 13
/
run_t5.sh
132 lines (111 loc) · 3.23 KB
/
run_t5.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash
# Launch configuration for distributed T5 pre-training (Megatron-LM-3D + DeepSpeed).

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=8888
NNODES=1
NODE_RANK=0
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

export DLWS_NUM_WORKER=${NNODES}
export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}

WORKING_DIR=${HOME}/Megatron-LM-3D
DATA_DIR=${HOME}/t5_data
DATA_PATH="${DATA_DIR}/pretrain_data/baike_small_document"
VOCAB_PATH="${DATA_DIR}/bpe_new"
CHECKPOINT_PATH=checkpoints/t5_test

config_json="${WORKING_DIR}/ds_config_t5.json"

# Megatron Model Parallelism
mp_size=2
# DeepSpeed Pipeline parallelism
pp_size=1

NLAYERS=2
NHIDDEN=128
BATCHSIZE=4
GAS=16

LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"

# ZeRO toggles, consumed when building deepspeed_options below.
# BUG FIX: these were referenced later but never defined anywhere; default
# them to "false" so the corresponding flags are only added when explicitly
# enabled. ("contigious" is the upstream spelling of the flag — keep it.)
contigious_gradients=false
reduce_scatter=false

# Activation Checkpointing and Contiguous Memory
checkpoint_activations=false
chkp_layers=1
PA=true
PA_CPU=false
CC=true
SYNCHRONIZE=true
PROFILE=false
# Megatron command-line arguments for pretrain_t5.py, collected in a single
# double-quoted string (each trailing "\" is a line continuation).
# BUG FIX: "--enc-seq-length 1024\" and "--dec-seq-length 384\" had no space
# before the backslash, which can fuse the value with the next flag when the
# continuation lines carry no leading whitespace.
t5_options=" \
       --model-parallel-size ${mp_size} \
       --pipe-parallel-size ${pp_size} \
       --num-layers $NLAYERS \
       --hidden-size $NHIDDEN \
       --kv-hidden-size 16 \
       --ff-hidden-size 256 \
       --num-attention-heads 8 \
       --enc-seq-length 1024 \
       --dec-seq-length 384 \
       --max-position-embeddings 1024 \
       --batch-size $BATCHSIZE \
       --gas $GAS \
       --train-iters 320000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file $VOCAB_PATH \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 1.5e-4 \
       --lr-decay-style cosine \
       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup 0.01 \
       --log-interval 1 \
       --save-interval 500 \
       --eval-interval 100 \
       --eval-iters 10 \
       --fp16 \
       --tensorboard-dir ${LOGDIR}
"
# DeepSpeed launcher/engine arguments.
deepspeed_options=" \
       --deepspeed \
       --deepspeed_config ${config_json} \
"

# Optionally append ZeRO optimizations. The ":-false" default guards against
# the toggle variables being unset (they were never defined in the original
# script, so these tests previously compared an empty string).
# "contigious" is the upstream flag spelling — do not "fix" it here.
if [ "${contigious_gradients:-false}" = "true" ]; then
  deepspeed_options="${deepspeed_options} \
       --zero-contigious-gradients"
fi

if [ "${reduce_scatter:-false}" = "true" ]; then
  deepspeed_options="${deepspeed_options} \
       --zero-reduce-scatter"
fi
# Build activation-checkpointing options.
# BUG FIX: the original condition was written ["${checkpoint_activations}" = "true"]
# with no spaces around the brackets — bash looks up a command literally named
# '[false' (or '[true'), fails with "command not found", and the branch is
# silently skipped. The else branch used 'chkp_opt = " "' (spaces around '='),
# which runs 'chkp_opt' as a command instead of assigning, leaving the
# variable unset.
if [ "${checkpoint_activations:-false}" = "true" ]; then
  chkp_opt=" \
       --checkpoint-activations \
       --checkpoint-num-layers ${chkp_layers}"

  if [ "${PA}" = "true" ]; then
    chkp_opt="${chkp_opt} \
       --partition-activations"
  fi

  if [ "${PA_CPU}" = "true" ]; then
    chkp_opt="${chkp_opt} \
       --checkpoint-in-cpu"
  fi

  if [ "${SYNCHRONIZE}" = "true" ]; then
    chkp_opt="${chkp_opt} \
       --synchronize-each-layer"
  fi

  # "contigious" is the upstream flag spelling — keep it.
  if [ "${CC}" = "true" ]; then
    chkp_opt="${chkp_opt} \
       --contigious-checkpointing"
  fi

  if [ "${PROFILE}" = "true" ]; then
    chkp_opt="${chkp_opt} \
       --profile-backward"
  fi
else
  chkp_opt=" "
fi
full_options="${t5_options} ${deepspeed_options} ${chkp_opt}"

# Build the launch command as an array so pass-through arguments ("$@") keep
# their word boundaries; the original flattened everything (with an unquoted
# $@) into one string and re-parsed it through 'eval', which breaks any
# argument containing spaces or shell metacharacters.
# NOTE(review): "-i node1:4,5,6,7" and "hostfile" are site-specific — adjust
# for your cluster.
# ${full_options} is intentionally unquoted: it is a whitespace-separated
# flag string that must word-split into individual arguments.
# shellcheck disable=SC2086
run_cmd=(deepspeed --master_port "${MASTER_PORT}" -i node1:4,5,6,7 --hostfile hostfile pretrain_t5.py "$@" ${full_options})

echo "${run_cmd[@]}"
"${run_cmd[@]}"
# (dropped trailing 'set +x' — there was no matching 'set -x', so it was dead code)