Update run_translation_no_trainer.py (huggingface#18637)
* Update run_translation_no_trainer.py

Found an error in selecting the `no_decay` parameters, plus some small modifications for when the user continues training from a checkpoint.

* Fix the `no_decay` and `resume_step` issues

1. Change the `no_decay` list.
2. If the user continues training their model from a provided checkpoint, `resume_step` is not initialized properly when `args.gradient_accumulation_steps != 1`; see the sketch after this list.
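
To make item 2 concrete, here is a minimal sketch of the corrected resume arithmetic with made-up numbers; only the `step_{i}` checkpoint naming, `args.gradient_accumulation_steps`, and `len(train_dataloader)` come from the scripts, the rest is illustrative:

```python
# Hypothetical numbers. Checkpoints are saved as `step_{completed_steps}`, where
# `completed_steps` counts optimizer updates, but the skip logic iterates over
# dataloader batches, so the saved count must be scaled back up.
gradient_accumulation_steps = 4   # stand-in for args.gradient_accumulation_steps
batches_per_epoch = 1000          # stand-in for len(train_dataloader)
checkpoint_steps = 600            # parsed from a checkpoint folder named "step_600"

resume_step = checkpoint_steps * gradient_accumulation_steps  # 2400 batches already consumed
starting_epoch = resume_step // batches_per_epoch             # 2 full epochs completed
resume_step -= starting_epoch * batches_per_epoch             # 400 batches into epoch 2

print(starting_epoch, resume_step)  # -> 2 400
```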
zhoutang776 authored and oneraghavan committed Sep 26, 2022
1 parent b2ebd12 commit ac387ee
Showing 3 changed files with 29 additions and 8 deletions.
13 changes: 10 additions & 3 deletions examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):

     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
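
For reference, a standalone sketch of the skip loop's counting logic above, with plain Python stand-ins for the dataloader and progress bar (all values below are made up):

```python
gradient_accumulation_steps = 4
resume_step = 8        # batches already consumed in the current epoch
completed_steps = 0    # the real script resumes this from the checkpoint

for step, batch in enumerate(range(12)):  # stand-in for the train dataloader
    if step < resume_step:
        # Count one completed (optimizer) step per `gradient_accumulation_steps`
        # skipped batches, mirroring progress_bar.update(1) in the script.
        if step % gradient_accumulation_steps == 0:
            completed_steps += 1
        continue
    # ... the normal forward/backward pass would run here ...

print(completed_steps)  # -> 2 (8 skipped batches / 4 accumulation steps)
```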
11 changes: 9 additions & 2 deletions examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -602,10 +602,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,7 +619,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
13 changes: 10 additions & 3 deletions examples/pytorch/translation/run_translation_no_trainer.py
@@ -510,7 +510,7 @@ def preprocess_function(examples):

     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def postprocess_text(preds, labels):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,7 +624,9 @@ def postprocess_text(preds, labels):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)
             loss = outputs.loss
