Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert/Fix: epoch indexing from 1, to be from 0 #2289

Merged
merged 7 commits into from
Jun 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Changed epoch indexing from 0 instead of 1 ([#2289](https://github.com/PyTorchLightning/pytorch-lightning/pull/2289))

### Deprecated

### Removed
Expand Down
12 changes: 4 additions & 8 deletions pytorch_lightning/callbacks/gradient_accumulation_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ class GradientAccumulationScheduler(Callback):
Args:
scheduling: scheduling in format {epoch: accumulation_factor}

.. warning::
Epochs indexing starts from "1" until v0.6.x,
but will start from "0" in v0.8.0.

Example::

>>> from pytorch_lightning import Trainer
Expand All @@ -42,13 +38,13 @@ def __init__(self, scheduling: dict):

for key in scheduling:
if not isinstance(key, int) or not isinstance(scheduling[key], int):
raise TypeError("All epochs and accumulation factor must be integers")
raise TypeError("All epoches and accumulation factor must be integers")
Borda marked this conversation as resolved.
Show resolved Hide resolved

minimal_epoch = min(scheduling.keys())
if minimal_epoch < 1:
if minimal_epoch < 0:
raise IndexError(f"Epochs indexing from 1, epoch {minimal_epoch} cannot be interpreted correct")
if minimal_epoch != 1: # if user didnt define first epoch accumulation factor
scheduling.update({1: 1})
if minimal_epoch != 0: # if user didnt define first epoch accumulation factor
scheduling.update({0: 1})

self.scheduling = scheduling
self.epochs = sorted(scheduling.keys())
Expand Down
4 changes: 2 additions & 2 deletions pytorch_lightning/callbacks/progress.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def total_val_batches(self) -> int:
if trainer.fast_dev_run and trainer.val_dataloaders is not None:
total_val_batches = len(trainer.val_dataloaders)
elif not self.trainer.disable_validation:
is_val_epoch = trainer.current_epoch % trainer.check_val_every_n_epoch == 0
is_val_epoch = (trainer.current_epoch + 1) % trainer.check_val_every_n_epoch == 0
Borda marked this conversation as resolved.
Show resolved Hide resolved
total_val_batches = sum(trainer.num_val_batches) if is_val_epoch else 0
return total_val_batches

Expand Down Expand Up @@ -318,7 +318,7 @@ def on_epoch_start(self, trainer, pl_module):
total_batches = total_train_batches + total_val_batches
if not self.main_progress_bar.disable:
self.main_progress_bar.reset(convert_inf(total_batches))
self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch}')
self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}')

def on_batch_end(self, trainer, pl_module):
super().on_batch_end(trainer, pl_module)
Expand Down
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/distrib_data_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):

# AMP
# run through amp wrapper before going to distributed DP
# TODO: remove in v0.8.0
# TODO: remove with dropping NVIDIA AMP support
if self.use_amp and not self.use_native_amp:
model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
self.optimizers = optimizers
Expand Down
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/distrib_parts.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def single_gpu_train(self, model):
# allow for lr schedulers as well
self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

# TODO: update for 0.8.0
# TODO: remove with dropping NVIDIA AMP support
if self.use_amp and not self.use_native_amp:
# An example
model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
Expand Down Expand Up @@ -240,7 +240,7 @@ def dp_train(self, model):
# wrap the user's forward in autocast and give it back at the end
model.forward = torch.cuda.amp.autocast()(model.forward)

# TODO: remove in v0.8.0
# TODO: remove with dropping NVIDIA AMP support
# check for this bug (amp + dp + !01 doesn't work)
# https://github.com/NVIDIA/apex/issues/227
if self.use_dp and self.use_amp and not self.use_native_amp:
Expand Down
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/training_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
structured dictionary
"""
checkpoint = {
'epoch': self.current_epoch,
'epoch': self.current_epoch + 1,
'global_step': self.global_step + 1,
'pytorch-ligthning_version': pytorch_lightning.__version__,
}
Expand Down
8 changes: 4 additions & 4 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,8 @@ def train(self):
model.on_train_start()

try:
# run all epochs from actual + 1 till the maximal
for epoch in range(self.current_epoch + 1, self.max_epochs + 1):
# run all epochs
for epoch in range(self.current_epoch, self.max_epochs):
# reset train dataloader
if self.reload_dataloaders_every_epoch:
self.reset_train_dataloader(model)
Expand Down Expand Up @@ -382,7 +382,7 @@ def train(self):
self.update_learning_rates(interval='epoch')

# early stopping
met_min_epochs = epoch >= self.min_epochs
met_min_epochs = epoch >= self.min_epochs - 1
met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

# TODO wrap this logic into the callback
Expand Down Expand Up @@ -476,7 +476,7 @@ def run_training_epoch(self):
# RUN VAL STEP
# ---------------
is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
can_check_epoch = self.current_epoch % self.check_val_every_n_epoch == 0
can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
Borda marked this conversation as resolved.
Show resolved Hide resolved
can_check_val = not self.disable_validation and can_check_epoch
should_check_val = is_val_check_batch or early_stop_epoch
should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
Expand Down
2 changes: 1 addition & 1 deletion tests/callbacks/test_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def training_step(self, *args, **kwargs):
result = trainer.fit(model)

assert result == 1, 'training failed to complete'
assert trainer.current_epoch <= trainer.max_epochs
assert trainer.current_epoch < trainer.max_epochs


def test_pickling(tmpdir):
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def training_epoch_end(self, outputs):
# a metric shared in both methods gets overwritten by epoch_end
assert metrics['shared_metric'] == 111
# metrics are kept after each epoch
for i in range(1, num_epochs + 1):
for i in range(num_epochs):
assert metrics[f'epoch_metric_{i}'] == i


Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_restore.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def test_dp_resume(tmpdir):
result = trainer.fit(model)

# track epoch before saving. Increment since we finished the current epoch, don't want to rerun
real_global_epoch = trainer.current_epoch
real_global_epoch = trainer.current_epoch + 1
Borda marked this conversation as resolved.
Show resolved Hide resolved

# correct result and ok accuracy
assert result == 1, 'amp + dp model failed to complete'
Expand Down
12 changes: 8 additions & 4 deletions tests/trainer/test_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,17 @@ def test_gradient_accumulation_scheduling(tmpdir):

# test incorrect configs
with pytest.raises(IndexError):
assert Trainer(accumulate_grad_batches={0: 3, 1: 4, 4: 6})
assert Trainer(accumulate_grad_batches={-1: 3, 1: 4, 4: 6})
with pytest.raises(IndexError):
assert Trainer(accumulate_grad_batches={-2: 3})

with pytest.raises(TypeError):
assert Trainer(accumulate_grad_batches={})
with pytest.raises(TypeError):
assert Trainer(accumulate_grad_batches=[[2, 3], [4, 6]])
with pytest.raises(TypeError):
assert Trainer(accumulate_grad_batches={1: 2, 3.: 4})
with pytest.raises(TypeError):
assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5})

# test optimizer call freq matches scheduler
Expand Down Expand Up @@ -451,7 +455,7 @@ def test_trainer_max_steps_and_epochs(tmpdir):

# check training stopped at max_epochs
assert trainer.global_step == num_train_samples * trainer.max_epochs
assert trainer.current_epoch == trainer.max_epochs, "Model did not stop at max_epochs"
assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs"


def test_trainer_min_steps_and_epochs(tmpdir):
Expand Down Expand Up @@ -619,7 +623,7 @@ def validation_epoch_end(self, *args, **kwargs):

# check that limit_val_batches=0 turns off validation
assert result == 1, 'training failed to complete'
assert trainer.current_epoch == 2
assert trainer.current_epoch == 1
assert not model.validation_step_invoked, \
'`validation_step` should not run when `limit_val_batches=0`'
assert not model.validation_epoch_end_invoked, \
Expand All @@ -632,7 +636,7 @@ def validation_epoch_end(self, *args, **kwargs):
result = trainer.fit(model)

assert result == 1, 'training failed to complete'
assert trainer.current_epoch == 1
assert trainer.current_epoch == 0
assert model.validation_step_invoked, \
'did not run `validation_step` with `fast_dev_run=True`'
assert model.validation_epoch_end_invoked, \
Expand Down