From 4bfc7bdd36ecd29b5261cf0951b5e393fe32748c Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Wed, 25 Aug 2021 20:40:04 +0200
Subject: [PATCH] fix global step and epoch counters on failed checkpointing

---
 pytorch_lightning/trainer/trainer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 19ccf3935a168..c22b776abe89d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1379,4 +1379,9 @@ def _on_exception(self):
             return
         # save a checkpoint for fault tolerant training. we don't use `log_dir` to minimize the chances of failure.
         file_path = os.path.join(self.default_root_dir, ".pl_auto_save.ckpt")
+        # CheckpointConnector.dump_checkpoint will bump the counters, but we counteract it here since we failed
+        # and have not actually completed the epoch/step.
+        # TODO: remove when FitLoop and TrainingEpochLoop no longer depend on these counters for the done() condition
+        self.fit_loop.global_step -= 1
+        self.fit_loop.current_epoch -= 1
         self.save_checkpoint(file_path)
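
Reviewer note: the sketch below is not PyTorch Lightning code. It is a minimal,
self-contained illustration of the counter arithmetic this patch compensates
for, under the assumption (taken from the patch comment) that
CheckpointConnector.dump_checkpoint writes global_step + 1 and
current_epoch + 1 into the checkpoint, anticipating a successfully completed
step/epoch. FakeFitLoop and dump_counters are hypothetical stand-ins.

    class FakeFitLoop:
        """Stand-in for FitLoop: only the two counters the patch touches."""

        def __init__(self, global_step: int, current_epoch: int) -> None:
            self.global_step = global_step
            self.current_epoch = current_epoch


    def dump_counters(loop: FakeFitLoop) -> dict:
        # Mimics the +1 bump assumed of CheckpointConnector.dump_checkpoint:
        # the checkpoint records the step/epoch as already finished.
        return {"global_step": loop.global_step + 1, "epoch": loop.current_epoch + 1}


    if __name__ == "__main__":
        # Training crashed mid-step: step 41 of epoch 3 never completed.
        loop = FakeFitLoop(global_step=41, current_epoch=3)

        # Without this patch, the auto-save would claim step 42 / epoch 4 finished.
        assert dump_counters(loop) == {"global_step": 42, "epoch": 4}

        # With the patch, decrementing first makes the +1 bump land back on the
        # interrupted step/epoch, so a restore resumes exactly where we failed.
        loop.global_step -= 1
        loop.current_epoch -= 1
        assert dump_counters(loop) == {"global_step": 41, "epoch": 3}

Pre-decrementing in _on_exception keeps the fix local to the failure path
instead of teaching dump_checkpoint about failed steps; the TODO in the diff
flags the workaround for removal once the loops' done() conditions stop
depending on these counters.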