diff --git a/CHANGELOG.md b/CHANGELOG.md index 2649abc02d54e..51e56e7c1b18b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Changed epoch indexing to start from 0 instead of 1 ([#2289](https://github.com/PyTorchLightning/pytorch-lightning/pull/2289)) + ### Deprecated ### Removed diff --git a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py index 15b67e6c33560..997574644072d 100644 --- a/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py +++ b/pytorch_lightning/callbacks/gradient_accumulation_scheduler.py @@ -17,10 +17,6 @@ class GradientAccumulationScheduler(Callback): Args: scheduling: scheduling in format {epoch: accumulation_factor} - .. warning:: - Epochs indexing starts from "1" until v0.6.x, - but will start from "0" in v0.8.0. - Example:: >>> from pytorch_lightning import Trainer @@ -42,13 +38,13 @@ def __init__(self, scheduling: dict): for key in scheduling: if not isinstance(key, int) or not isinstance(scheduling[key], int): - raise TypeError("All epochs and accumulation factor must be integers") + raise TypeError("All epochs and accumulation factor must be integers") minimal_epoch = min(scheduling.keys()) - if minimal_epoch < 1: + if minimal_epoch < 0: raise IndexError(f"Epochs indexing from 1, epoch {minimal_epoch} cannot be interpreted correct") - if minimal_epoch != 1: # if user didnt define first epoch accumulation factor - scheduling.update({1: 1}) + if minimal_epoch != 0: # if user didnt define first epoch accumulation factor - scheduling.update({1: 1}) + scheduling.update({0: 1}) self.scheduling = scheduling self.epochs = sorted(scheduling.keys()) diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 715650228c181..358575273f945 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -96,7 
@@ def total_val_batches(self) -> int: if trainer.fast_dev_run and trainer.val_dataloaders is not None: total_val_batches = len(trainer.val_dataloaders) elif not self.trainer.disable_validation: - is_val_epoch = trainer.current_epoch % trainer.check_val_every_n_epoch == 0 + is_val_epoch = (trainer.current_epoch + 1) % trainer.check_val_every_n_epoch == 0 total_val_batches = sum(trainer.num_val_batches) if is_val_epoch else 0 return total_val_batches @@ -318,7 +318,7 @@ def on_epoch_start(self, trainer, pl_module): total_batches = total_train_batches + total_val_batches if not self.main_progress_bar.disable: self.main_progress_bar.reset(convert_inf(total_batches)) - self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch}') + self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}') def on_batch_end(self, trainer, pl_module): super().on_batch_end(trainer, pl_module) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 4de06127feb5a..8818b899a21cd 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -518,7 +518,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0): # AMP # run through amp wrapper before going to distributed DP - # TODO: remove in v0.8.0 + # TODO: remove with dropping NVIDIA AMP support if self.use_amp and not self.use_native_amp: model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index df1c9afe43933..5aad0f25ce700 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -174,7 +174,7 @@ def single_gpu_train(self, model): # allow for lr schedulers as well self.optimizers, self.lr_schedulers, self.optimizer_frequencies = 
self.init_optimizers(model) - # TODO: update for 0.8.0 + # TODO: remove with dropping NVIDIA AMP support if self.use_amp and not self.use_native_amp: # An example model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) @@ -240,7 +240,7 @@ def dp_train(self, model): # wrap the user's forward in autocast and give it back at the end model.forward = torch.cuda.amp.autocast()(model.forward) - # TODO: remove in v0.8.0 + # TODO: remove with dropping NVIDIA AMP support # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 if self.use_dp and self.use_amp and not self.use_native_amp: diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 6350ded8fc67f..605efc0bbcc9c 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -323,7 +323,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: structured dictionary """ checkpoint = { - 'epoch': self.current_epoch, + 'epoch': self.current_epoch + 1, 'global_step': self.global_step + 1, 'pytorch-ligthning_version': pytorch_lightning.__version__, } diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 59a3d4122f89e..a7608e95019f3 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -346,8 +346,8 @@ def train(self): model.on_train_start() try: - # run all epochs from actual + 1 till the maximal - for epoch in range(self.current_epoch + 1, self.max_epochs + 1): + # run all epochs + for epoch in range(self.current_epoch, self.max_epochs): # reset train dataloader if self.reload_dataloaders_every_epoch: self.reset_train_dataloader(model) @@ -382,7 +382,7 @@ def train(self): self.update_learning_rates(interval='epoch') # early stopping - met_min_epochs = epoch >= self.min_epochs + met_min_epochs = epoch >= self.min_epochs - 1 met_min_steps = 
self.global_step >= self.min_steps if self.min_steps else True # TODO wrap this logic into the callback @@ -476,7 +476,7 @@ def run_training_epoch(self): # RUN VAL STEP # --------------- is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0 - can_check_epoch = self.current_epoch % self.check_val_every_n_epoch == 0 + can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0 can_check_val = not self.disable_validation and can_check_epoch should_check_val = is_val_check_batch or early_stop_epoch should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf')) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index d2bafe6d7c991..9405730dc9698 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -286,7 +286,7 @@ def training_step(self, *args, **kwargs): result = trainer.fit(model) assert result == 1, 'training failed to complete' - assert trainer.current_epoch <= trainer.max_epochs + assert trainer.current_epoch < trainer.max_epochs def test_pickling(tmpdir): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 820b30cacd2e7..78efbd35ff4da 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -68,7 +68,7 @@ def training_epoch_end(self, outputs): # a metric shared in both methods gets overwritten by epoch_end assert metrics['shared_metric'] == 111 # metrics are kept after each epoch - for i in range(1, num_epochs + 1): + for i in range(num_epochs): assert metrics[f'epoch_metric_{i}'] == i diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 5276521e0dadb..b907bc3c7114a 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -172,7 +172,7 @@ def test_dp_resume(tmpdir): result = trainer.fit(model) # track epoch before saving. 
Increment since we finished the current epoch, don't want to rerun - real_global_epoch = trainer.current_epoch + real_global_epoch = trainer.current_epoch + 1 # correct result and ok accuracy assert result == 1, 'amp + dp model failed to complete' diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6574e5094011b..876bad6daa81b 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -107,13 +107,17 @@ def test_gradient_accumulation_scheduling(tmpdir): # test incorrect configs with pytest.raises(IndexError): - assert Trainer(accumulate_grad_batches={0: 3, 1: 4, 4: 6}) + assert Trainer(accumulate_grad_batches={-1: 3, 1: 4, 4: 6}) + with pytest.raises(IndexError): assert Trainer(accumulate_grad_batches={-2: 3}) with pytest.raises(TypeError): assert Trainer(accumulate_grad_batches={}) + with pytest.raises(TypeError): assert Trainer(accumulate_grad_batches=[[2, 3], [4, 6]]) + with pytest.raises(TypeError): assert Trainer(accumulate_grad_batches={1: 2, 3.: 4}) + with pytest.raises(TypeError): assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5}) # test optimizer call freq matches scheduler @@ -451,7 +455,7 @@ def test_trainer_max_steps_and_epochs(tmpdir): # check training stopped at max_epochs assert trainer.global_step == num_train_samples * trainer.max_epochs - assert trainer.current_epoch == trainer.max_epochs, "Model did not stop at max_epochs" + assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs" def test_trainer_min_steps_and_epochs(tmpdir): @@ -619,7 +623,7 @@ def validation_epoch_end(self, *args, **kwargs): # check that limit_val_batches=0 turns off validation assert result == 1, 'training failed to complete' - assert trainer.current_epoch == 2 + assert trainer.current_epoch == 1 assert not model.validation_step_invoked, \ '`validation_step` should not run when `limit_val_batches=0`' assert not model.validation_epoch_end_invoked, \ @@ -632,7 +636,7 @@ def 
validation_epoch_end(self, *args, **kwargs): result = trainer.fit(model) assert result == 1, 'training failed to complete' - assert trainer.current_epoch == 1 + assert trainer.current_epoch == 0 assert model.validation_step_invoked, \ 'did not run `validation_step` with `fast_dev_run=True`' assert model.validation_epoch_end_invoked, \