From 93c20c2836d2ab14a274f29c84dbd0e0f853417d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 14 Sep 2020 03:00:59 +0200
Subject: [PATCH 1/4] fix + test

---
 pytorch_lightning/trainer/training_loop.py |  9 ++++-----
 tests/models/test_grad_norm.py             | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index b5f80672a30dd..3fcbbafcda428 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -375,13 +375,12 @@ def on_before_backward(self, batch_idx, optimizer):
         return grad_norm_dic
 
     def _track_gradient_norm(self, batch_idx):
-        grad_norm_dic = {}
-        if batch_idx % self.trainer.row_log_interval == 0:
+        grad_norm_dict = {}
+        if (batch_idx + 1) % self.trainer.row_log_interval == 0:
             if float(self.trainer.track_grad_norm) > 0:
                 model = self.trainer.get_model()
-                grad_norm_dic = model.grad_norm(
-                    self.trainer.track_grad_norm)
-        return grad_norm_dic
+                grad_norm_dict = model.grad_norm(self.trainer.track_grad_norm)
+        return grad_norm_dict
 
     def log_training_step_metrics(self, opt_closure_result, batch_callback_metrics, batch_log_metrics):
         # track callback metrics
diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py
index 2e0d4454500f3..b39dbc936d5aa 100644
--- a/tests/models/test_grad_norm.py
+++ b/tests/models/test_grad_norm.py
@@ -1,4 +1,5 @@
 import os
+from unittest.mock import patch
 
 import numpy as np
 import pytest
@@ -73,3 +74,23 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3):
     log, mod = [log[k] for k in common], [mod[k] for k in common]
 
     assert np.allclose(log, mod, rtol=rtol)
+
+
+@pytest.mark.parametrize("row_log_interval", [1, 2, 3])
+def test_grad_tracking_interval(tmpdir, row_log_interval):
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        track_grad_norm=2,
+        row_log_interval=2,
+        max_steps=10,
+    )
+
+    with patch.object(trainer.logger, "log_metrics") as mocked:
+        model = EvalModelTemplate()
+        trainer.fit(model)
+        expected = trainer.global_step // 2
+        count = 0
+        for _, kwargs in mocked.call_args_list:
+            if "grad_2.0_norm_c_d1.weight" in kwargs.get("metrics", {}):
+                count += 1
+        assert count == expected

From 77a6c5f923dc9a78684f8464a24867276693bedf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 14 Sep 2020 03:20:46 +0200
Subject: [PATCH 2/4] changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f82cbe974ca8..05dc417d1d410 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -51,6 +51,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed getting `experiment_id` from MLFlow only once instead of each training loop ([#3394](https://github.com/PyTorchLightning/pytorch-lightning/pull/3394))
 
+- Fixed gradient norm tracking for `row_log_interval > 1` ([#3489](https://github.com/PyTorchLightning/pytorch-lightning/pull/3489))
+
 ## [0.9.0] - YYYY-MM-DD
 
 ### Added

From d955d5178460883dc0b2bab7b0df411bdf6adfb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 14 Sep 2020 05:13:38 +0200
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Tim Chard
---
 tests/models/test_grad_norm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py
index b39dbc936d5aa..b7c06d0449dcc 100644
--- a/tests/models/test_grad_norm.py
+++ b/tests/models/test_grad_norm.py
@@ -81,14 +81,14 @@ def test_grad_tracking_interval(tmpdir, row_log_interval):
     trainer = Trainer(
         default_root_dir=tmpdir,
         track_grad_norm=2,
-        row_log_interval=2,
+        row_log_interval=row_log_interval,
         max_steps=10,
     )
 
     with patch.object(trainer.logger, "log_metrics") as mocked:
         model = EvalModelTemplate()
         trainer.fit(model)
-        expected = trainer.global_step // 2
+        expected = trainer.global_step // row_log_interval
         count = 0
         for _, kwargs in mocked.call_args_list:
             if "grad_2.0_norm_c_d1.weight" in kwargs.get("metrics", {}):

From ef9e43035e3b976aeabebd086f174cb7d8af3ec8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 14 Sep 2020 05:46:13 +0200
Subject: [PATCH 4/4] improve test

---
 tests/models/test_grad_norm.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py
index b7c06d0449dcc..0e8dece3e070a 100644
--- a/tests/models/test_grad_norm.py
+++ b/tests/models/test_grad_norm.py
@@ -78,6 +78,7 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3):
 
 @pytest.mark.parametrize("row_log_interval", [1, 2, 3])
 def test_grad_tracking_interval(tmpdir, row_log_interval):
+    """ Test that gradient norms get tracked in the right interval and that the same keys get logged every time. """
     trainer = Trainer(
         default_root_dir=tmpdir,
         track_grad_norm=2,
@@ -89,8 +90,12 @@ def test_grad_tracking_interval(tmpdir, row_log_interval):
         model = EvalModelTemplate()
         trainer.fit(model)
         expected = trainer.global_step // row_log_interval
-        count = 0
+        grad_norm_dicts = []
         for _, kwargs in mocked.call_args_list:
-            if "grad_2.0_norm_c_d1.weight" in kwargs.get("metrics", {}):
-                count += 1
-        assert count == expected
+            metrics = kwargs.get("metrics", {})
+            grad_norm_dict = {k: v for k, v in metrics.items() if k.startswith("grad_")}
+            if grad_norm_dict:
+                grad_norm_dicts.append(grad_norm_dict)
+
+        assert len(grad_norm_dicts) == expected
+        assert all(grad_norm_dicts[0].keys() == g.keys() for g in grad_norm_dicts)
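
A note on the one-line fix in PATCH 1/4, with an illustrative sketch. batch_idx is 0-based while trainer.global_step counts completed steps, so the old "batch_idx % self.trainer.row_log_interval == 0" check fired on the very first batch and then on a schedule shifted by one relative to every row_log_interval-th completed step. The fixed "(batch_idx + 1) % ..." check fires exactly trainer.global_step // row_log_interval times, which is what the new test asserts. The following standalone Python sketch reproduces just that arithmetic; it is not part of the patch series, and tracked_batches is a made-up helper for illustration only:

# Standalone sketch, not part of the patches above: compares which 0-based
# batch indices trigger gradient norm tracking under the old and the fixed
# check, against the count the new test expects after max_steps=10.

def tracked_batches(num_steps, interval, fixed):
    """Return the batch indices on which grad norms would be tracked."""
    if fixed:
        # fixed check from PATCH 1/4: (batch_idx + 1) % row_log_interval == 0
        return [i for i in range(num_steps) if (i + 1) % interval == 0]
    # old check: batch_idx % row_log_interval == 0
    return [i for i in range(num_steps) if i % interval == 0]

for interval in (1, 2, 3):
    old = tracked_batches(10, interval, fixed=False)
    new = tracked_batches(10, interval, fixed=True)
    expected = 10 // interval  # mirrors trainer.global_step // row_log_interval
    print("interval=%d: old fires %d times %s, new fires %d times %s, test expects %d"
          % (interval, len(old), old, len(new), new, expected))

For interval=3 the old check fires on batches 0, 3, 6, 9 (4 times) while only 10 // 3 == 3 logged entries are expected, and for interval=2 it fires on the even batches while metrics are written after the odd ones, so the parametrized test fails without the fix.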