
[bugfix] Logging only on not should_accumulate() during training (#5417)

* resolve bug

* resolve tests

* update

* Update tests/loggers/test_tensorboard.py

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>

(cherry picked from commit a053d75)
tchaton authored and Borda committed Jan 26, 2021
1 parent 383cff0 commit 1f2b780
Showing 4 changed files with 24 additions and 18 deletions.
@@ -205,7 +205,7 @@ def cache_training_step_metrics(self, opt_closure_result):
         self._logged_metrics.update(logged_metrics_tmp)
         self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)

-    def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
+    def log_metrics(self, metrics, grad_norm_dic, step=None):
         """Logs the metric dict passed in.
         If `step` parameter is None and `step` key is presented is metrics,
         uses metrics["step"] as a step
@@ -234,11 +234,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=

         elif step is None:
             # added metrics by Lightning for convenience
-            if log_train_step_metrics:
-                step = self.trainer.total_batch_idx
-            else:
-                scalar_metrics['epoch'] = self.trainer.current_epoch
-                step = self.trainer.global_step
+            scalar_metrics['epoch'] = self.trainer.current_epoch
+            step = self.trainer.global_step

         # log actual metrics
         if self.trainer.logger is not None:
@@ -622,6 +619,8 @@ def __gather_result_across_time_and_optimizers(self, epoch_output):
         return gathered_epoch_outputs

     def log_train_step_metrics(self, batch_output):
+        if self.trainer.train_loop.should_accumulate() and self.trainer.train_loop.automatic_optimization:
+            return
         _, batch_log_metrics = self.cached_results.update_logger_connector()
         # when metrics should be logged
         if self.should_update_logs or self.trainer.fast_dev_run is True:
@@ -630,5 +629,5 @@ def log_train_step_metrics(self, batch_output):
             if grad_norm_dic is None:
                 grad_norm_dic = {}
             if len(batch_log_metrics) > 0 or len(grad_norm_dic) > 0:
-                self.log_metrics(batch_log_metrics, grad_norm_dic, log_train_step_metrics=True)
+                self.log_metrics(batch_log_metrics, grad_norm_dic)
                 self._callback_metrics.update(batch_log_metrics)
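The logger connector hunks above replace the special log_train_step_metrics branch with one early return: under automatic optimization, train-step logging is skipped while gradients are still being accumulated, so every emitted metric is stamped with trainer.global_step (plus the 'epoch' key) instead of total_batch_idx. The snippet below is a self-contained toy illustration of that guard; ToyTrainLoop and ToyLoggerConnector are stand-ins invented for this sketch, not the library's classes.

from dataclasses import dataclass, field

@dataclass
class ToyTrainLoop:
    # stand-ins for the two attributes the new guard reads; purely illustrative
    automatic_optimization: bool = True
    accumulating: bool = False

    def should_accumulate(self) -> bool:
        return self.accumulating

@dataclass
class ToyLoggerConnector:
    train_loop: ToyTrainLoop
    logged: list = field(default_factory=list)

    def log_train_step_metrics(self, batch_log_metrics: dict) -> None:
        # the guard added by this commit: while gradients are still being
        # accumulated under automatic optimization, skip train-step logging
        if self.train_loop.should_accumulate() and self.train_loop.automatic_optimization:
            return
        if batch_log_metrics:
            self.logged.append(batch_log_metrics)

connector = ToyLoggerConnector(ToyTrainLoop(accumulating=True))
connector.log_train_step_metrics({'loss': 0.5})   # skipped: still accumulating
connector.train_loop.accumulating = False
connector.log_train_step_metrics({'loss': 0.4})   # logged: an optimizer step follows
assert connector.logged == [{'loss': 0.4}]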
4 changes: 2 additions & 2 deletions tests/loggers/test_all.py
@@ -142,15 +142,15 @@ def log_metrics(self, metrics, step):
     if logger_class == TensorBoardLogger:
         expected = [
             (0, ['hp_metric']),
-            (0, ['train_some_val']),
+            (0, ['epoch', 'train_some_val']),
             (0, ['early_stop_on', 'epoch', 'val_loss']),
             (0, ['hp_metric']),
             (1, ['epoch', 'test_loss'])
         ]
         assert log_metric_names == expected
     else:
         expected = [
-            (0, ['train_some_val']),
+            (0, ['epoch', 'train_some_val']),
             (0, ['early_stop_on', 'epoch', 'val_loss']),
             (1, ['epoch', 'test_loss'])
         ]
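The expected tuples gain an 'epoch' entry because log_metrics() now always attaches the current epoch when no explicit step is given, so train-step metrics such as train_some_val reach the logger together with an epoch key. The helper below is a hypothetical, simplified rendering of that branch written for illustration; it is not the library API.

def add_convenience_keys(scalar_metrics, current_epoch, global_step, step=None):
    # mirrors the simplified branch in log_metrics(): when no step is supplied,
    # stamp the metrics with the epoch and fall back to global_step
    if step is None:
        scalar_metrics['epoch'] = current_epoch
        step = global_step
    return scalar_metrics, step

metrics, step = add_convenience_keys({'train_some_val': 1.23}, current_epoch=0, global_step=0)
assert sorted(metrics) == ['epoch', 'train_some_val']  # matches the updated expectation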
19 changes: 11 additions & 8 deletions tests/loggers/test_tensorboard.py
@@ -213,19 +213,22 @@ def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmp
     Tests to ensure that tensorboard log properly when accumulated_gradients > 1
     """
     class TestModel(BoringModel):
-        _count = 0
-        _indexes = []
+
+        def __init__(self):
+            super().__init__()
+            self._count = 0
+            self._indexes = []

         def training_step(self, batch, batch_idx):
             output = self.layer(batch)
             loss = self.loss(batch, output)
             self.log('count', self._count, on_step=True, on_epoch=True)
             self.log('loss', loss, on_step=True, on_epoch=True)

-            if self.trainer.logger_connector.should_update_logs:
-                self._indexes.append(self._count)
+            if not self.trainer.train_loop.should_accumulate():
+                if self.trainer.logger_connector.should_update_logs:
+                    self._indexes.append(self.trainer.global_step)

-            self._count += 1
             return loss

         def validation_step(self, batch, batch_idx):
@@ -245,20 +248,20 @@ def configure_optimizers(self):

     logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)

-    accumulate_grad_batches = 2
     trainer = Trainer(
         default_root_dir=tmpdir,
         limit_train_batches=12,
-        limit_val_batches=12,
+        limit_val_batches=0,
         max_epochs=3,
         gpus=0,
-        accumulate_grad_batches=accumulate_grad_batches,
+        accumulate_grad_batches=2,
         logger=[logger_0],
         log_every_n_steps=3,
     )
     trainer.fit(model)

     mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]]
     assert mock_count_epochs == expected

     mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]]
     assert model._indexes == mock_count_steps
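As a sanity check on the updated test, the loop below simulates which global_step values get logged for the configuration above (12 train batches, 3 epochs, accumulate_grad_batches=2, log_every_n_steps=3), under the assumption that should_update_logs fires when (global_step + 1) % log_every_n_steps == 0; that condition is an assumption of this sketch, not something stated in the diff.

limit_train_batches, max_epochs = 12, 3
accumulate_grad_batches, log_every_n_steps = 2, 3

logged_steps, global_step = [], 0
for epoch in range(max_epochs):
    for batch_idx in range(limit_train_batches):
        accumulating = (batch_idx + 1) % accumulate_grad_batches != 0
        if not accumulating:
            # only batches that trigger an optimizer step are eligible for logging
            if (global_step + 1) % log_every_n_steps == 0:
                logged_steps.append(global_step)
            global_step += 1

print(logged_steps)  # [2, 5, 8, 11, 14, 17]: every logged step is an optimizer step

With the guard in place, batches that only accumulate gradients never trigger logging, so the steps collected in model._indexes line up with the steps recorded by the mocked log_metrics calls, which is exactly what the final assertion checks.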
6 changes: 5 additions & 1 deletion tests/trainer/logging_process/test_train_loop_logging_1_0.py
@@ -23,12 +23,16 @@
 import numpy as np
 import pytest
 import torch
-from torch.utils.data import Dataset
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset, random_split
+from torchvision import transforms
+from torchvision.datasets.mnist import MNIST

 import pytorch_lightning as pl
 from pytorch_lightning import callbacks, Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.loggers import WandbLogger
 from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset
 from tests.base.deterministic_model import DeterministicModel

