From 6ca50c4f6bae8929fd5d9a8bd4455b4005add5e7 Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Fri, 21 Apr 2023 15:52:29 +0200 Subject: [PATCH 1/6] upload checkpoint files from stream --- src/lightning/pytorch/loggers/neptune.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/loggers/neptune.py b/src/lightning/pytorch/loggers/neptune.py index 07c8acb1a04ea..47d1d59da8401 100644 --- a/src/lightning/pytorch/loggers/neptune.py +++ b/src/lightning/pytorch/loggers/neptune.py @@ -43,12 +43,14 @@ import neptune from neptune import Run from neptune.handler import Handler + from neptune.types import File from neptune.utils import stringify_unsupported elif _NEPTUNE_CLIENT_AVAILABLE: # <1.0 package structure import neptune.new as neptune from neptune.new import Run from neptune.new.handler import Handler + from neptune.new.types import File from neptune.new.utils import stringify_unsupported else: # needed for tests, mocks and function signatures @@ -488,7 +490,8 @@ def after_save_checkpoint(self, checkpoint_callback: Checkpoint) -> None: if hasattr(checkpoint_callback, "last_model_path") and checkpoint_callback.last_model_path: model_last_name = self._get_full_model_name(checkpoint_callback.last_model_path, checkpoint_callback) file_names.add(model_last_name) - self.run[f"{checkpoints_namespace}/{model_last_name}"].upload(checkpoint_callback.last_model_path) + with open(checkpoint_callback.last_model_path, "rb") as fp: + self.run[f"{checkpoints_namespace}/{model_last_name}"] = File.from_stream(fp) # save best k models if hasattr(checkpoint_callback, "best_k_models"): @@ -503,7 +506,8 @@ def after_save_checkpoint(self, checkpoint_callback: Checkpoint) -> None: model_name = self._get_full_model_name(checkpoint_callback.best_model_path, checkpoint_callback) file_names.add(model_name) - self.run[f"{checkpoints_namespace}/{model_name}"].upload(checkpoint_callback.best_model_path) + with 
open(checkpoint_callback.best_model_path, "rb") as fp: + self.run[f"{checkpoints_namespace}/{model_name}"] = File.from_stream(fp) # remove old models logged to experiment if they are not part of best k models at this point if self.run.exists(checkpoints_namespace): From 906275ec3d649905fb1d58f411a49109afb0c961 Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Mon, 24 Apr 2023 08:36:37 +0200 Subject: [PATCH 2/6] add File=None in imports + update the changelog --- src/lightning/pytorch/CHANGELOG.md | 2 +- src/lightning/pytorch/loggers/neptune.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index e80a8b31c4ae8..e4b500196042e 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -65,7 +65,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Fixed issue where `Model.load_from_checkpoint("checkpoint.ckpt", map_location=map_location)` would always return model on CPU ([#17308](https://github.com/Lightning-AI/lightning/pull/17308)) - +- Fixed a potential bug with uploading model checkpoints to Neptune.ai by uploading files from stream ([#17430](https://github.com/Lightning-AI/lightning/pull/17430)) ## [2.0.1.post0] - 2023-04-11 diff --git a/src/lightning/pytorch/loggers/neptune.py b/src/lightning/pytorch/loggers/neptune.py index 47d1d59da8401..71f57fdfd2c16 100644 --- a/src/lightning/pytorch/loggers/neptune.py +++ b/src/lightning/pytorch/loggers/neptune.py @@ -54,7 +54,7 @@ from neptune.new.utils import stringify_unsupported else: # needed for tests, mocks and function signatures - neptune, Run, Handler, stringify_unsupported = None, None, None, None + neptune, Run, Handler, File, stringify_unsupported = None, None, None, None, None log = logging.getLogger(__name__) From 63af53e5b1c75d951cc38efe1bb13d28928e0be7 Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Wed, 26 Apr 2023 
13:03:02 +0200 Subject: [PATCH 3/6] patch neptune File object --- tests/tests_pytorch/loggers/test_all.py | 1 + tests/tests_pytorch/loggers/test_neptune.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index 01f350dea8096..73860fa7e29fa 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -48,6 +48,7 @@ mock.patch("lightning.pytorch.loggers.neptune._NEPTUNE_AVAILABLE", return_value=True), mock.patch("lightning.pytorch.loggers.neptune.Run", new=mock.Mock), mock.patch("lightning.pytorch.loggers.neptune.Handler", new=mock.Mock), + mock.patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock), mock.patch("lightning.pytorch.loggers.wandb.wandb"), mock.patch("lightning.pytorch.loggers.wandb.Run", new=mock.Mock), ) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index a560586198f65..f2e69413f1b1e 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -200,6 +200,7 @@ def _fit_and_test(self, logger, model): assert trainer.log_dir == os.path.join(os.getcwd(), ".neptune") @pytest.mark.usefixtures("tmpdir_unittest_fixture") + @patch("lightning.pytorch.loggers.neptune.File") def test_neptune_leave_open_experiment_after_fit(self, neptune): """Verify that neptune experiment was NOT closed after training.""" # given @@ -215,6 +216,7 @@ def test_neptune_leave_open_experiment_after_fit(self, neptune): assert run_instance_mock.stop.call_count == 0 @pytest.mark.usefixtures("tmpdir_unittest_fixture") + @patch("lightning.pytorch.loggers.neptune.File") def test_neptune_log_metrics_on_trained_model(self, neptune): """Verify that trained models do log data.""" # given @@ -303,6 +305,7 @@ def test_log_model_summary(self, neptune): self.assertEqual(run_instance_mock.__getitem__.call_count, 0) 
run_instance_mock.__setitem__.assert_called_once_with(model_summary_key, file_from_content_mock) + @patch("lightning.pytorch.loggers.neptune.File") def test_after_save_checkpoint(self, neptune): test_variants = [ ({}, "training/model"), From 358febc9abe956853cd88f251ba657cb15b51df2 Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Wed, 26 Apr 2023 14:26:34 +0200 Subject: [PATCH 4/6] add new param to the patch decorators --- tests/tests_pytorch/loggers/test_neptune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index f2e69413f1b1e..4498473ea6326 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -200,7 +200,7 @@ def _fit_and_test(self, logger, model): assert trainer.log_dir == os.path.join(os.getcwd(), ".neptune") @pytest.mark.usefixtures("tmpdir_unittest_fixture") - @patch("lightning.pytorch.loggers.neptune.File") + @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) def test_neptune_leave_open_experiment_after_fit(self, neptune): """Verify that neptune experiment was NOT closed after training.""" # given @@ -216,7 +216,7 @@ def test_neptune_leave_open_experiment_after_fit(self, neptune): assert run_instance_mock.stop.call_count == 0 @pytest.mark.usefixtures("tmpdir_unittest_fixture") - @patch("lightning.pytorch.loggers.neptune.File") + @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) def test_neptune_log_metrics_on_trained_model(self, neptune): """Verify that trained models do log data.""" # given @@ -305,7 +305,7 @@ def test_log_model_summary(self, neptune): self.assertEqual(run_instance_mock.__getitem__.call_count, 0) run_instance_mock.__setitem__.assert_called_once_with(model_summary_key, file_from_content_mock) - @patch("lightning.pytorch.loggers.neptune.File") + @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) def 
test_after_save_checkpoint(self, neptune): test_variants = [ ({}, "training/model"), From fc87698625341cf32ce685c3d021ae2699bada96 Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Thu, 27 Apr 2023 11:30:54 +0200 Subject: [PATCH 5/6] adjust tests for new checkpoint saving logic --- tests/tests_pytorch/loggers/test_all.py | 2 +- tests/tests_pytorch/loggers/test_neptune.py | 28 ++++++++++----------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index 73860fa7e29fa..f9ba0cff2e4f9 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -48,7 +48,7 @@ mock.patch("lightning.pytorch.loggers.neptune._NEPTUNE_AVAILABLE", return_value=True), mock.patch("lightning.pytorch.loggers.neptune.Run", new=mock.Mock), mock.patch("lightning.pytorch.loggers.neptune.Handler", new=mock.Mock), - mock.patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock), + mock.patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock()), mock.patch("lightning.pytorch.loggers.wandb.wandb"), mock.patch("lightning.pytorch.loggers.wandb.Run", new=mock.Mock), ) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index 4498473ea6326..444e17808aa6f 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -200,7 +200,7 @@ def _fit_and_test(self, logger, model): assert trainer.log_dir == os.path.join(os.getcwd(), ".neptune") @pytest.mark.usefixtures("tmpdir_unittest_fixture") - @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) + @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock()) def test_neptune_leave_open_experiment_after_fit(self, neptune): """Verify that neptune experiment was NOT closed after training.""" # given @@ -216,7 +216,7 @@ def test_neptune_leave_open_experiment_after_fit(self, neptune): assert 
run_instance_mock.stop.call_count == 0 @pytest.mark.usefixtures("tmpdir_unittest_fixture") - @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) + @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock()) def test_neptune_log_metrics_on_trained_model(self, neptune): """Verify that trained models do log data.""" # given @@ -305,7 +305,7 @@ def test_log_model_summary(self, neptune): self.assertEqual(run_instance_mock.__getitem__.call_count, 0) run_instance_mock.__setitem__.assert_called_once_with(model_summary_key, file_from_content_mock) - @patch("lightning.pytorch.loggers.neptune.File", new=mock.Mock) + @patch("builtins.open", mock.mock_open(read_data="test")) def test_after_save_checkpoint(self, neptune): test_variants = [ ({}, "training/model"), @@ -330,26 +330,24 @@ def test_after_save_checkpoint(self, neptune): best_model_score=None, ) - # when: save checkpoint - logger.after_save_checkpoint(cb_mock) + with patch("lightning.pytorch.loggers.neptune.File.from_stream", side_effect=mock.Mock()) as mock_file: + # when: save checkpoint + logger.after_save_checkpoint(cb_mock) # then: - self.assertEqual(run_instance_mock.__setitem__.call_count, 1) - self.assertEqual(run_instance_mock.__getitem__.call_count, 4) - self.assertEqual(run_attr_mock.upload.call_count, 4) - run_instance_mock.__setitem__.assert_called_once_with( - f"{model_key_prefix}/best_model_path", os.path.join(models_root_dir, "best_model") - ) - run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/last") + self.assertEqual(run_instance_mock.__setitem__.call_count, 3) + self.assertEqual(run_instance_mock.__getitem__.call_count, 2) + self.assertEqual(run_attr_mock.upload.call_count, 2) + + self.assertEqual(mock_file.call_count, 2) + run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model1") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model2/with/slashes") - 
run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/best_model") + run_attr_mock.upload.assert_has_calls( [ - call(os.path.join(models_root_dir, "last")), call(os.path.join(models_root_dir, "model1")), call(os.path.join(models_root_dir, "model2/with/slashes")), - call(os.path.join(models_root_dir, "best_model")), ] ) From 8cac366046c4f23026489565273eb3ff9499a28e Mon Sep 17 00:00:00 2001 From: Aleksander Wojnarowicz Date: Thu, 27 Apr 2023 11:58:24 +0200 Subject: [PATCH 6/6] mock entire File, not from_stream method --- tests/tests_pytorch/loggers/test_neptune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index 444e17808aa6f..16ee33a5ecc89 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -330,7 +330,7 @@ def test_after_save_checkpoint(self, neptune): best_model_score=None, ) - with patch("lightning.pytorch.loggers.neptune.File.from_stream", side_effect=mock.Mock()) as mock_file: + with patch("lightning.pytorch.loggers.neptune.File", side_effect=mock.Mock()) as mock_file: # when: save checkpoint logger.after_save_checkpoint(cb_mock) @@ -339,7 +339,7 @@ def test_after_save_checkpoint(self, neptune): self.assertEqual(run_instance_mock.__getitem__.call_count, 2) self.assertEqual(run_attr_mock.upload.call_count, 2) - self.assertEqual(mock_file.call_count, 2) + self.assertEqual(mock_file.from_stream.call_count, 2) run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model1") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model2/with/slashes")