Update usage of apply_decay_param_fun in AdamW. (PaddlePaddle#81)
guoshengCS authored Mar 8, 2021
1 parent 6e7d3d1 commit 2edaba0
Showing 28 changed files with 227 additions and 144 deletions.
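For context, every file below receives the same refactor: the parameter-name list formerly built inline inside the apply_decay_param_fun lambda is now computed once as decay_params and captured by the lambda. The following is a minimal, self-contained sketch of the new usage; the tiny model, learning rate, epsilon, and weight-decay values are placeholders for the example, not values taken from the repository, while the decay_params / apply_decay_param_fun pattern mirrors the diffs that follow.

import paddle


class TinyModel(paddle.nn.Layer):
    """Stand-in model so the sketch is self-contained (not from the repo)."""

    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)
        self.norm = paddle.nn.LayerNorm(16)
        self.classifier = paddle.nn.Linear(16, 2)

    def forward(self, x):
        return self.classifier(self.norm(self.linear(x)))


model = TinyModel()

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded: their attribute
# paths (n) contain "bias" or "norm".
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,  # the scripts pass a LinearDecayWithWarmup scheduler here
    epsilon=1e-8,        # placeholder for args.adam_epsilon
    parameters=model.parameters(),
    weight_decay=0.01,   # placeholder for args.weight_decay
    # Apply decay only to parameters whose framework names are in decay_params.
    apply_decay_param_fun=lambda x: x in decay_params)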
11 changes: 7 additions & 4 deletions benchmark/bert/run_glue.py
@@ -387,15 +387,18 @@ def do_train(args):
with paddle.static.program_guard(main_program, startup_program):
lr_scheduler = LinearDecayWithWarmup(
args.learning_rate, num_training_steps, args.warmup_steps)
+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)
optimizer.minimize(loss)

# Create the metric pass for the validation
11 changes: 7 additions & 4 deletions benchmark/bert/run_pretrain.py
@@ -313,15 +313,18 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-],
+apply_decay_param_fun=lambda x: x in decay_params,
multi_precision=args.use_pure_fp16)
if worker_num == 1 and args.use_amp:
custom_black_list = (['lookup_table', 'lookup_table_v2']
11 changes: 7 additions & 4 deletions benchmark/bert/run_pretrain_single.py
@@ -229,15 +229,18 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-],
+apply_decay_param_fun=lambda x: x in decay_params,
multi_precision=False)
if args.use_amp:
custom_black_list = (['lookup_table', 'lookup_table_v2']
11 changes: 7 additions & 4 deletions examples/benchmark/glue/run_glue.py
@@ -314,17 +314,20 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
warmup)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
beta1=0.9,
beta2=0.999,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)

loss_fct = paddle.nn.loss.CrossEntropyLoss(
) if train_ds.label_list else paddle.nn.loss.MSELoss()
6 changes: 4 additions & 2 deletions examples/dialogue/dgu/main.py
@@ -117,15 +117,17 @@ def train(args, model, train_data_loader, dev_data_loader, metric, rank):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
args.warmup_proportion)

-decay_params_list = [
+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in decay_params_list,
+apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
loss_fn = DGULossFunction(args.task_name)

6 changes: 4 additions & 2 deletions examples/dialogue/lic2021_baseline/finetune.py
@@ -63,15 +63,17 @@ def main(args):

lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
args.warmup_steps)
-decay_params_list = [
+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in decay_params_list,
+apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

step = 0
10 changes: 7 additions & 3 deletions examples/information_extraction/DuIE/run_duie.py
@@ -193,13 +193,17 @@ def do_train():
num_training_steps = steps_by_epoch * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_ratio)
+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])])
+apply_decay_param_fun=lambda x: x in decay_params)

# Starts training.
global_step = 0
13 changes: 8 additions & 5 deletions examples/information_extraction/msra_ner/train.py
@@ -117,7 +117,7 @@ def do_train(args):
batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
-'seq_len': Stack(),  # seq_len
+'seq_len': Stack(),  # seq_len
'labels': Pad(axis=0, pad_val=ignore_label) # label
}): fn(samples)

@@ -151,15 +151,18 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)

loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

11 changes: 7 additions & 4 deletions examples/language_model/bert/run_glue.py
@@ -368,17 +368,20 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
warmup)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
beta1=0.9,
beta2=0.999,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)

loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
) else paddle.nn.loss.MSELoss()
15 changes: 9 additions & 6 deletions examples/language_model/bert/run_pretrain.py
@@ -312,15 +312,18 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(
args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)
if args.use_amp:
scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

@@ -330,8 +333,8 @@ def do_train(args):
for epoch in range(args.num_train_epochs):
files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
-    if os.path.isfile(os.path.join(args.input_dir, f)) and
-    "training" in f
+    if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in
+    f
]
files.sort()
num_files = len(files)
17 changes: 10 additions & 7 deletions examples/language_model/bigbird/run_pretrain.py
@@ -146,7 +146,7 @@ def do_train(args):
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

-# Define the pretrain model and metric
+# Define the pretrain model and metric
model = BigBirdForPretraining(
BigBirdModel(**model_class.pretrained_init_configuration[
args.model_name_or_path]))
@@ -157,21 +157,24 @@ def do_train(args):
if worker_num > 1:
model = paddle.DataParallel(model)

-# Define learing_rate scheduler and optimizer
+# Define learing_rate scheduler and optimizer
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps,
args.warmup_steps)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)

-# Get bigbird config for generate random attention mask
+# Get bigbird config for generate random attention mask
global config
config = BigBirdModel.pretrained_init_configuration[args.model_name_or_path]

11 changes: 7 additions & 4 deletions examples/language_model/electra/run_glue.py
@@ -314,17 +314,20 @@ def do_train(args):
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
warmup)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
beta1=0.9,
beta2=0.999,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)

loss_fct = paddle.nn.loss.CrossEntropyLoss(
) if train_ds.label_list else paddle.nn.loss.MSELoss()
11 changes: 7 additions & 4 deletions examples/language_model/electra/run_pretrain.py
@@ -506,16 +506,19 @@ def do_train(args):
args.warmup_steps)

clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
grad_clip=clip,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)
if args.use_amp:
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

11 changes: 7 additions & 4 deletions examples/language_model/gpt2/run_pretrain.py
@@ -119,16 +119,19 @@ def do_train(args):
if args.grad_clip > 0:
clip = paddle.nn.ClipGradByNorm(clip_norm=args.grad_clip)

+# Generate parameter names needed to perform weight decay.
+# All bias and LayerNorm parameters are excluded.
+decay_params = [
+    p.name for n, p in model.named_parameters()
+    if not any(nd in n for nd in ["bias", "norm"])
+]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
grad_clip=clip,
-apply_decay_param_fun=lambda x: x in [
-    p.name for n, p in model.named_parameters()
-    if not any(nd in n for nd in ["bias", "norm"])
-])
+apply_decay_param_fun=lambda x: x in decay_params)
if args.model_name_or_path not in pretrained_models_list:
opt_dict = paddle.load(
os.path.join(args.model_name_or_path, "model_state.pdopt"))