Update run_translation_no_trainer.py (huggingface#18637)
* Update run_translation_no_trainer.py

Found an error in selecting the `no_decay` parameters, plus some small modifications for when the user continues training from a checkpoint.

* Fix the `no_decay` and `resume_step` issues

1. Change the `no_decay` list.
2. If the user continues training their model from a provided checkpoint, `resume_step` is not initialized properly when `args.gradient_accumulation_steps != 1`; see the sketch after this list.
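
To make item 2 concrete, here is a minimal sketch of the corrected resume arithmetic with made-up numbers; only the `step_{i}` checkpoint naming, `args.gradient_accumulation_steps`, and `len(train_dataloader)` come from the scripts, the rest is illustrative:

```python
# Hypothetical numbers. Checkpoints are saved as `step_{completed_steps}`, where
# `completed_steps` counts optimizer updates, but the skip logic iterates over
# dataloader batches, so the saved count must be scaled back up.
gradient_accumulation_steps = 4   # stand-in for args.gradient_accumulation_steps
batches_per_epoch = 1000          # stand-in for len(train_dataloader)
checkpoint_steps = 600            # parsed from a checkpoint folder named "step_600"

resume_step = checkpoint_steps * gradient_accumulation_steps  # 2400 batches already consumed
starting_epoch = resume_step // batches_per_epoch             # 2 full epochs completed
resume_step -= starting_epoch * batches_per_epoch             # 400 batches into epoch 2

print(starting_epoch, resume_step)  # -> 2 400
```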
zhoutang776 authored and oneraghavan committed Sep 26, 2022
1 parent b2ebd12 commit ac387ee
Showing 3 changed files with 29 additions and 8 deletions.
13 changes: 10 additions & 3 deletions examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):

     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
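
For reference, a standalone sketch of the skip loop's counting logic above, with plain Python stand-ins for the dataloader and progress bar (all values below are made up):

```python
gradient_accumulation_steps = 4
resume_step = 8        # batches already consumed in the current epoch
completed_steps = 0    # the real script resumes this from the checkpoint

for step, batch in enumerate(range(12)):  # stand-in for the train dataloader
    if step < resume_step:
        # Count one completed (optimizer) step per `gradient_accumulation_steps`
        # skipped batches, mirroring progress_bar.update(1) in the script.
        if step % gradient_accumulation_steps == 0:
            completed_steps += 1
        continue
    # ... the normal forward/backward pass would run here ...

print(completed_steps)  # -> 2 (8 skipped batches / 4 accumulation steps)
```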
11 changes: 9 additions & 2 deletions examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -602,10 +602,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,7 +619,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
13 changes: 10 additions & 3 deletions examples/pytorch/translation/run_translation_no_trainer.py
@@ -510,7 +510,7 @@ def preprocess_function(examples):

     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def postprocess_text(preds, labels):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,7 +624,9 @@ def postprocess_text(preds, labels):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)
             loss = outputs.loss
