[Fix] Fix the bug that the training log and evaluating log are mixed #1252

Merged · 10 commits · Aug 11, 2021
26 changes: 18 additions & 8 deletions mmcv/runner/hooks/evaluation.py
@@ -9,6 +9,7 @@

from mmcv.utils import is_seq_of
from .hook import Hook
from .logger import LoggerHook


class EvalHook(Hook):
@@ -211,19 +212,31 @@ def before_train_epoch(self, runner):

    def after_train_iter(self, runner):
        """Called after every training iter to evaluate the results."""
        if not self.by_epoch:
        if not self.by_epoch and self._should_evaluate(runner):
            # Because the priority of EvalHook is higher than LoggerHook, the
            # training log and the evaluating log are mixed. Therefore,
            # we need to dump the training log and clear it before evaluating
            # log is generated. In addition, this problem will only appear in
            # `IterBasedRunner` whose `self.by_epoch` is False, because
            # `EpochBasedRunner` whose `self.by_epoch` is True calls
            # `_do_evaluate` in `after_train_epoch` stage, and at this stage
            # the training log has been printed, so it will not cause any
            # problem. more details at
            # https://github.com/open-mmlab/mmsegmentation/issues/694
            for hook in runner._hooks:
                if isinstance(hook, LoggerHook):
                    hook.after_train_iter(runner)
                runner.log_buffer.clear()

            self._do_evaluate(runner)

    def after_train_epoch(self, runner):
        """Called after every training epoch to evaluate the results."""
        if self.by_epoch:
        if self.by_epoch and self._should_evaluate(runner):
            self._do_evaluate(runner)

    def _do_evaluate(self, runner):
        """perform evaluation and save ckpt."""
        if not self._should_evaluate(runner):
            return

        results = self.test_fn(runner.model, self.dataloader)
        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
        key_score = self.evaluate(runner, results)
@@ -418,9 +431,6 @@ def _do_evaluate(self, runner):
                    dist.broadcast(module.running_var, 0)
                    dist.broadcast(module.running_mean, 0)

        if not self._should_evaluate(runner):
            return

        tmpdir = self.tmpdir
        if tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')
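The comment added to `after_train_iter` above summarizes the root cause: when `by_epoch` is False, `EvalHook` runs at a higher priority than `LoggerHook`, so the evaluation output lands in the same `log_buffer` that still holds the pending training losses, and the two are dumped as one record. Below is a minimal, self-contained sketch of that interaction; `ToyRunner`, `ToyLoggerHook` and `ToyEvalHook` are invented names rather than mmcv classes, and only the flush-then-clear-then-evaluate ordering mirrors the actual change.

# Minimal, self-contained sketch of the ordering problem and the fix.
class ToyLogBuffer:
    def __init__(self):
        self.output = {}

    def clear(self):
        self.output = {}


class ToyLoggerHook:
    # Lowest priority: called last in after_train_iter.
    def after_train_iter(self, runner):
        if runner.log_buffer.output:
            print(f'iter {runner.iter}: {runner.log_buffer.output}')
            runner.log_buffer.clear()


class ToyEvalHook:
    # Higher priority: called before ToyLoggerHook.
    def after_train_iter(self, runner):
        # Without the three lines below, the evaluation result would be
        # written into the buffer that still holds the training loss, and
        # the logger would print one mixed record.
        for hook in runner.hooks:
            if isinstance(hook, ToyLoggerHook):
                hook.after_train_iter(runner)   # dump the pending training log
        runner.log_buffer.clear()
        runner.log_buffer.output['acc'] = 0.9   # stand-in for _do_evaluate()


class ToyRunner:
    def __init__(self, hooks):
        self.hooks = hooks                      # already sorted by priority
        self.log_buffer = ToyLogBuffer()
        self.iter = 0

    def train_iter(self):
        self.log_buffer.output['loss'] = 0.5    # stand-in for train_step()
        for hook in self.hooks:                 # the eval hook runs first
            hook.after_train_iter(self)
        self.iter += 1


ToyRunner([ToyEvalHook(), ToyLoggerHook()]).train_iter()
# prints two separate records: {'loss': 0.5} first, then {'acc': 0.9}

Removing the flush-and-clear lines in `ToyEvalHook` reproduces the original symptom: the logger prints a single record containing both 'loss' and 'acc'.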
51 changes: 42 additions & 9 deletions tests/test_runner/test_eval_hook.py
@@ -1,3 +1,4 @@
import json
import os.path as osp
import tempfile
import unittest.mock as mock
@@ -7,13 +8,14 @@
import pytest
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from mmcv.runner import DistEvalHook as BaseDistEvalHook
from mmcv.runner import EpochBasedRunner
from mmcv.runner import EvalHook as BaseEvalHook
from mmcv.runner import IterBasedRunner
from mmcv.utils import get_logger
from mmcv.utils import get_logger, scandir


class ExampleDataset(Dataset):
@@ -48,18 +50,16 @@ class Model(nn.Module):

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)
        self.param = nn.Parameter(torch.tensor([1.0]))

    def forward(self, x, **kwargs):
        return x
        return self.param * x

    def train_step(self, data_batch, optimizer, **kwargs):
        if not isinstance(data_batch, dict):
            data_batch = dict(x=data_batch)
        return data_batch
        return {'loss': torch.sum(self(data_batch['x']))}

    def val_step(self, x, optimizer, **kwargs):
        return dict(loss=self(x))
    def val_step(self, data_batch, optimizer, **kwargs):
        return {'loss': torch.sum(self(data_batch['x']))}


def _build_epoch_runner():
@@ -307,7 +307,7 @@ def test_eval_hook():
                          (_build_iter_runner, False)])
def test_start_param(EvalHookParam, _build_demo_runner, by_epoch):
    # create dummy data
    dataloader = DataLoader(torch.ones((5, 2)))
    dataloader = DataLoader(EvalDataset())

    # 0.1. dataloader is not a DataLoader object
    with pytest.raises(TypeError):
@@ -389,3 +389,36 @@ def test_start_param(EvalHookParam, _build_demo_runner, by_epoch):
    runner._iter = 1
    runner.run([dataloader], [('train', 1)], 3)
    assert evalhook.evaluate.call_count == 2  # after epoch 2 & 3


@pytest.mark.parametrize('runner,by_epoch,eval_hook_priority',
                         [(EpochBasedRunner, True, 'NORMAL'),
                          (EpochBasedRunner, True, 'LOW'),
                          (IterBasedRunner, False, 'LOW')])
def test_logger(runner, by_epoch, eval_hook_priority):
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, by_epoch=by_epoch, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_logger')
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        runner = runner(
            model=model, optimizer=optimizer, work_dir=tmpdir, logger=logger)
        runner.register_logger_hooks(
            dict(
                interval=1,
                hooks=[dict(type='TextLoggerHook', by_epoch=by_epoch)]))
        runner.register_timer_hook(dict(type='IterTimerHook'))
        runner.register_hook(eval_hook, priority=eval_hook_priority)
        runner.run([loader], [('train', 1)], 1)

        path = osp.join(tmpdir, next(scandir(tmpdir, '.json')))
        with open(path) as fr:
            fr.readline()  # skip first line which is hook_msg
            train_log = json.loads(fr.readline())
            assert train_log['mode'] == 'train' and 'time' in train_log
            val_log = json.loads(fr.readline())
            assert val_log['mode'] == 'val' and 'time' not in val_log
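
The last four assertions encode the expected layout of the TextLoggerHook .json log after the fix: one JSON record per line, with the training record carrying the 'time' key added by IterTimerHook and the evaluation record written separately without it (mmcv's logger uses the presence of 'time' to tell a training dump from an evaluation dump). A rough standalone version of that check is sketched below, assuming only this JSON-lines layout; check_log_layout and log_path are illustrative names, not mmcv APIs.

import json


def check_log_layout(log_path):
    # Rough standalone version of the assertions in test_logger above:
    # every 'train' record should carry 'time' and every 'val' record
    # should not. log_path points at a TextLoggerHook .json log.
    with open(log_path) as f:
        records = [json.loads(line) for line in f if line.strip()]
    for record in records:
        if record.get('mode') == 'train':
            assert 'time' in record, record
        elif record.get('mode') == 'val':
            assert 'time' not in record, record
    return records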