Fix resume-from-checkpoint bookkeeping in the `no_trainer` example scripts.

When resuming from a `step_N` checkpoint, N is multiplied by
`args.gradient_accumulation_steps` before it is converted into a starting
epoch and a number of dataloader batches to skip. The progress bar and
`completed_steps` are advanced for the epochs already done, and then once per
accumulation window while batches are skipped. The no-decay parameter filter
matches both the `LayerNorm.weight` and `layer_norm.weight` spellings so that
neither naming scheme starts receiving weight decay.

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 3fd67d5fbf66e4..225b88a49440cc 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply by `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if loading from a checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 80dfcf9a9194e5..c5f6aad4126f5a 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -602,10 +602,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply by `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if loading from a checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,7 +619,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index a6b0988f63d090..34c2ad1964090f 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -510,7 +510,7 @@ def preprocess_function(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def postprocess_text(preds, labels):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply by `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if loading from a checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,7 +624,9 @@ def postprocess_text(preds, labels):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)
             loss = outputs.loss