From 3aa1b06bc82bbf78262c36e2f197bcb4d1faacaf Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 19:44:28 +0100
Subject: [PATCH 1/8] drop unused pl model in ckpt

---
 pytorch_lightning/callbacks/model_checkpoint.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index ec735eb5363a2..ab6e09f3d9155 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -291,8 +291,7 @@ def _del_model(self, filepath: str):
         self._fs.rm(filepath)
         log.debug(f"Removed checkpoint: {filepath}")
 
-    def _save_model(self, filepath: str, trainer, pl_module):
-        # Todo: required argument `pl_module` is not used
+    def _save_model(self, filepath: str, trainer):
         # in debugging, track when we save checkpoints
         trainer.dev_debugger.track_checkpointing_history(filepath)
 
@@ -505,9 +504,9 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
 
         if trainer.training_type_plugin.rpc_enabled:
             # RPCPlugin manages saving all model states
-            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
+            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer)
         else:
-            self._save_model(last_filepath, trainer, pl_module)
+            self._save_model(last_filepath, trainer)
         if (
             self.last_model_path and self.last_model_path != last_filepath
             and (self.save_top_k != -1 or self.save_last) and trainer.is_global_zero
@@ -574,7 +573,7 @@ def _update_best_and_save(
                 f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}"
                 f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}'
             )
-            self._save_model(filepath, trainer, pl_module)
+            self._save_model(filepath, trainer)
 
         if del_filepath is not None and filepath != del_filepath:
             self._del_model(del_filepath)

From 58eca725291ea57a62ce7778bb859a19dd838f3b Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 19:52:20 +0100
Subject: [PATCH 2/8] irrelevant

---
 pytorch_lightning/trainer/connectors/slurm_connector.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py
index f2bb00abd84bd..54529e3346f0f 100644
--- a/pytorch_lightning/trainer/connectors/slurm_connector.py
+++ b/pytorch_lightning/trainer/connectors/slurm_connector.py
@@ -28,8 +28,6 @@ def register_slurm_signal_handlers(self):
         signal.signal(signal.SIGTERM, self.term_handler)
 
     def sig_handler(self, signum, frame):  # pragma: no-cover
-        # Todo: required argument `signum` is not used
-        # Todo: required argument `frame` is not used
         if self.trainer.is_global_zero:
             # save weights
             log.info('handling SIGUSR1')
@@ -59,7 +57,5 @@ def sig_handler(self, signum, frame):  # pragma: no-cover
         # close experiment to avoid issues
         self.trainer.logger.close()
 
-    def term_handler(self, signum, frame):
-        # Todo: required argument `signum` is not used
-        # Todo: required argument `frame` is not used
+    def term_handler(self, signum, frame):  # pragma: no-cover
         log.info("bypassing sigterm")
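
A note on patch 2: the `signum` and `frame` parameters flagged by the deleted Todo comments cannot actually be removed, because Python's `signal` module always invokes a registered handler with both arguments. A minimal standalone sketch of that protocol (the handler body is illustrative, not Lightning's):

```python
import signal

# Python calls a registered handler as handler(signum, frame), so both
# parameters are required by the protocol even when the body uses neither --
# the same situation as sig_handler/term_handler in slurm_connector.py.
def term_handler(signum, frame):
    print("bypassing sigterm")

signal.signal(signal.SIGTERM, term_handler)
```
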
From 83039478f66c3fcb4cebb926ac4922711f7be8f5 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 19:52:27 +0100
Subject: [PATCH 3/8] on_evaluation_batch_start

---
 .../trainer/connectors/logger_connector/logger_connector.py | 3 +--
 pytorch_lightning/trainer/evaluation_loop.py                | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 8ebec3238e276..b0d9caa55df7b 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -101,8 +101,7 @@ def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoc
             current_hook_fx_name=hook_fx_name, on_step=on_step, on_epoch=on_epoch
         )
 
-    def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders):
-        # Todo: required argument `testing` is not used
+    def on_evaluation_batch_start(self, batch, dataloader_idx, num_dataloaders):
         model = self.trainer.lightning_module
         # set dataloader_idx only if multiple ones
         model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 087741aa69c2b..0b5c0eb9326f8 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -283,9 +283,7 @@ def _convert_to_numpy(v):
 
     def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx):
         # set dataloader_idx to model and track batch_size
-        self.trainer.logger_connector.on_evaluation_batch_start(
-            self.trainer.testing, batch, dataloader_idx, self.num_dataloaders
-        )
+        self.trainer.logger_connector.on_evaluation_batch_start(batch, dataloader_idx, self.num_dataloaders)
 
         if self.trainer.testing:
             self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx)

From 900ed4d8b6f83f5978695d54181d54ee16c99dd1 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 19:54:25 +0100
Subject: [PATCH 4/8] evaluation_epoch_end

---
 .../trainer/connectors/logger_connector/logger_connector.py | 3 +--
 pytorch_lightning/trainer/evaluation_loop.py                | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index b0d9caa55df7b..45cdecfdc8515 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -259,8 +259,7 @@ def track_metrics_deprecated(self, deprecated_eval_results):
         self._track_callback_metrics(deprecated_eval_results)
         self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results)
 
-    def evaluation_epoch_end(self, testing):
-        # Todo: required argument `testing` is not used
+    def evaluation_epoch_end(self):
         # reset dataloader idx
         model_ref = self.trainer.lightning_module
         model_ref._current_dataloader_idx = None
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 0b5c0eb9326f8..9272f8e2be924 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -181,7 +181,7 @@ def evaluation_step_end(self, *args, **kwargs):
 
     def evaluation_epoch_end(self):
         # unset dataloder_idx in model
-        self.trainer.logger_connector.evaluation_epoch_end(self.trainer.testing)
+        self.trainer.logger_connector.evaluation_epoch_end()
 
         # call the model epoch end
         deprecated_results = self.__run_eval_epoch_end(self.num_dataloaders)
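
Patches 3 and 4 follow the same recipe: instead of threading `testing` through as an unused parameter, the connector can consult `self.trainer.testing` itself when it needs it. The bookkeeping these hooks perform is small; a standalone sketch of it (class and function names here are illustrative, not Lightning's API):

```python
# Sketch of the dataloader-index bookkeeping around these hooks: the index is
# exposed to the model only when several dataloaders are present, and it is
# reset at the end of the evaluation epoch.
class DummyModule:
    _current_dataloader_idx = None

def on_evaluation_batch_start(model, dataloader_idx, num_dataloaders):
    # set dataloader_idx only if multiple ones (mirrors the connector above)
    model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None

def evaluation_epoch_end(model):
    # reset dataloader idx (mirrors patch 4)
    model._current_dataloader_idx = None

m = DummyModule()
on_evaluation_batch_start(m, dataloader_idx=0, num_dataloaders=1)
assert m._current_dataloader_idx is None
on_evaluation_batch_start(m, dataloader_idx=1, num_dataloaders=3)
assert m._current_dataloader_idx == 1
evaluation_epoch_end(m)
assert m._current_dataloader_idx is None
```
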
From de7a9e497b4e2f10139147a273c62080d600356e Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 19:55:44 +0100
Subject: [PATCH 5/8] attach_datamodule

---
 pytorch_lightning/trainer/connectors/data_connector.py | 5 ++---
 pytorch_lightning/trainer/trainer.py                   | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py
index 6ff35aadc36a3..97484e5f473fd 100644
--- a/pytorch_lightning/trainer/connectors/data_connector.py
+++ b/pytorch_lightning/trainer/connectors/data_connector.py
@@ -81,7 +81,7 @@ def attach_data(self, model, train_dataloader, val_dataloaders, datamodule):
 
         # set up the passed in dataloaders (if needed)
         self.attach_dataloaders(model, train_dataloader, val_dataloaders)
-        self.attach_datamodule(model, datamodule, 'fit')
+        self.attach_datamodule(model, datamodule)
 
     def __enforce_datamodule_dataloader_override(self, train_dataloader, val_dataloaders, datamodule):
         # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
@@ -112,8 +112,7 @@ def attach_dataloaders(
         if predict_dataloaders is not None:
             model.predict_dataloader = _PatchDataLoader(predict_dataloaders)
 
-    def attach_datamodule(self, model, datamodule: Optional[LightningDataModule], stage: str) -> None:
-        # Todo: required argument `stage` is not used
+    def attach_datamodule(self, model, datamodule: Optional[LightningDataModule]) -> None:
        # We use datamodule if it's been provided on .fit or .test, otherwise we check model for it
         datamodule = datamodule or getattr(model, 'datamodule', None)
 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 7a3dde64ddf98..bf20ec0fd5ae3 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -880,7 +880,7 @@ def test(
         )
 
         # Attach datamodule to get setup/prepare_data added to model before the call to it below
-        self.data_connector.attach_datamodule(model or self.lightning_module, datamodule, 'test')
+        self.data_connector.attach_datamodule(model or self.lightning_module, datamodule)
 
         if model is not None:
             results = self.__test_given_model(model, test_dataloaders)
@@ -989,7 +989,7 @@ def predict(
 
         if datamodule is not None:
             # Attach datamodule to get setup/prepare_data added to model before the call to it below
-            self.data_connector.attach_datamodule(model, datamodule, 'predict')
+            self.data_connector.attach_datamodule(model, datamodule)
 
         # attach data
         if dataloaders is not None:

From dc9a53f1d394a0566da03cf76e1617e098a652ab Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 20:20:54 +0100
Subject: [PATCH 6/8] rpc

---
 pytorch_lightning/plugins/training_type/rpc.py | 2 +-
 tests/plugins/test_rpc_plugin.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index 3c016f3cb8e25..376adfb098b4a 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -63,7 +63,7 @@ def init_rpc_connection(self, global_rank: int, world_size: int) -> None:
             rpc._set_rpc_timeout(self.rpc_timeout_sec)
             self._is_rpc_initialized = True
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None:
+    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
         """
         Override to save model to disk.
         This is required as the main process will be required to handle aggregating model states from RPC processes.
diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py
index 5f907c39f344c..62630cf366fd8 100644
--- a/tests/plugins/test_rpc_plugin.py
+++ b/tests/plugins/test_rpc_plugin.py
@@ -58,7 +58,7 @@ def __init__(self, **kwargs):
         self.rpc_save_model_count = 0
         self.worker_optimizer_step_count = 0
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None:
+    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
         self.rpc_save_model_count += 1
 
     def barrier(self, name: Optional[str] = None) -> None:
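
Patch 5 can drop `stage` safely because `attach_datamodule` never consulted it: its only logic is the fallback that prefers an explicitly passed datamodule and otherwise looks for one attached to the model. A self-contained sketch of that one line (`resolve_datamodule` and `DummyModel` are hypothetical names):

```python
# Mirrors `datamodule = datamodule or getattr(model, 'datamodule', None)`
# from data_connector.py: an explicit argument wins, a model attribute is
# the fallback, and None means no datamodule at all.
def resolve_datamodule(model, datamodule=None):
    return datamodule or getattr(model, 'datamodule', None)

class DummyModel:
    pass

model = DummyModel()
assert resolve_datamodule(model) is None                    # neither given
model.datamodule = 'dm_from_model'
assert resolve_datamodule(model) == 'dm_from_model'         # model fallback
assert resolve_datamodule(model, 'explicit') == 'explicit'  # argument wins
```

Patch 6 applies the same treatment to `rpc_save_model`, which turns out to be less safe, as the next patch shows.
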
From 6ab1ea567ed00af8d70925fe360a1b39a3d7a697 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 20:47:46 +0100
Subject: [PATCH 7/8] rpc

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 pytorch_lightning/plugins/training_type/rpc.py  | 2 +-
 tests/plugins/test_rpc_plugin.py                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index ab6e09f3d9155..b1f306b56a7fa 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -504,7 +504,7 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
 
         if trainer.training_type_plugin.rpc_enabled:
             # RPCPlugin manages saving all model states
-            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer)
+            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
         else:
             self._save_model(last_filepath, trainer)
         if (
diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index 376adfb098b4a..3c016f3cb8e25 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -63,7 +63,7 @@ def init_rpc_connection(self, global_rank: int, world_size: int) -> None:
             rpc._set_rpc_timeout(self.rpc_timeout_sec)
             self._is_rpc_initialized = True
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
+    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None:
         """
         Override to save model to disk.
         This is required as the main process will be required to handle aggregating model states from RPC processes.
diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py
index 62630cf366fd8..5f907c39f344c 100644
--- a/tests/plugins/test_rpc_plugin.py
+++ b/tests/plugins/test_rpc_plugin.py
@@ -58,7 +58,7 @@ def __init__(self, **kwargs):
         self.rpc_save_model_count = 0
         self.worker_optimizer_step_count = 0
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
+    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None:
         self.rpc_save_model_count += 1
 
     def barrier(self, name: Optional[str] = None) -> None:
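
Patch 7 walks patch 6 back. A plausible reading: trimming `pl_module` from the base `RPCPlugin.rpc_save_model` alone is not enough, because the method is overridden elsewhere — patch 8 below still has to update the `rpc_sequential.py` override, whose body read `pl_module.sequential_module`. Until the call site and every override agree on the signature, the trimmed call fails. A minimal, hypothetical illustration of that failure mode (these classes are not Lightning's):

```python
class BasePlugin:
    # Patch 6 trimmed the base signature to three arguments...
    def rpc_save_model(self, save_model_fn, last_filepath, trainer):
        raise NotImplementedError

class SequentialPlugin(BasePlugin):
    # ...but an override still written against the old four-argument form
    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module):
        pass

plugin = SequentialPlugin()
try:
    # The trimmed call from patch 6 no longer passes pl_module:
    plugin.rpc_save_model(lambda *a: None, 'last.ckpt', trainer=object())
except TypeError as err:
    print(err)  # missing 1 required positional argument: 'pl_module'
```
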
From 424b71e145125a0c68ae1c56b3bd4c20d71cee85 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 2 Mar 2021 21:16:03 +0100
Subject: [PATCH 8/8] rpc

---
 pytorch_lightning/callbacks/model_checkpoint.py |  6 +++---
 pytorch_lightning/plugins/training_type/rpc.py  |  3 +--
 .../plugins/training_type/rpc_sequential.py     | 10 +++++-----
 tests/plugins/test_rpc_plugin.py                |  2 +-
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index b1f306b56a7fa..d552560191a35 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -239,7 +239,7 @@ def save_checkpoint(self, trainer, pl_module):
         self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates)
 
         # Mode 2: save the last checkpoint
-        self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
+        self._save_last_checkpoint(trainer, monitor_candidates)
 
     def __validate_init_configuration(self):
         if self.save_top_k is not None and self.save_top_k < -1:
@@ -480,7 +480,7 @@ def _monitor_candidates(self, trainer):
         monitor_candidates.update(step=trainer.global_step, epoch=trainer.current_epoch)
         return monitor_candidates
 
-    def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
+    def _save_last_checkpoint(self, trainer, ckpt_name_metrics):
         should_save_last = self.monitor is None or self.save_last
         if not should_save_last:
             return
@@ -504,7 +504,7 @@ def _save_last_checkpoint(self, trainer, ckpt_name_metrics):
 
         if trainer.training_type_plugin.rpc_enabled:
             # RPCPlugin manages saving all model states
-            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
+            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer)
         else:
             self._save_model(last_filepath, trainer)
         if (
diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index 3c016f3cb8e25..0e8ca557e447b 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -63,7 +63,7 @@ def init_rpc_connection(self, global_rank: int, world_size: int) -> None:
             rpc._set_rpc_timeout(self.rpc_timeout_sec)
             self._is_rpc_initialized = True
 
-    def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None:
+    def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None:
         """
         Override to save model to disk.
         This is required as the main process will be required to handle aggregating model states from RPC processes.
@@ -72,7 +72,6 @@ def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> No
             save_model_fn: The saving function to save final model.
             last_filepath: The filepath to save the model to.
            trainer: The trainer object.
-            pl_module: The LightningModule.
""" raise NotImplementedError diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index 3878aa9db3ea4..329c82b5ed7f8 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -266,17 +266,17 @@ def configure_ddp(self): self._model.require_backward_grad_sync = False @rank_zero_only - def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None: + def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None: model = self.lightning_module if not hasattr(model.sequential_module, "foreach_worker"): return - current_layers = pl_module.sequential_module + current_layers = model.sequential_module model.sequential_module.foreach_worker( save_layers_on_all_rank_zero_workers, {"gpus_per_model": self.gpus_per_model}, include_self=True ) - pl_module.sequential_module = load_sequential_from_saved_layers(self.gpus_per_model) - save_model_fn(last_filepath, trainer, pl_module) - pl_module.sequential_module = current_layers + model.sequential_module = load_sequential_from_saved_layers(self.gpus_per_model) + save_model_fn(last_filepath, trainer) + model.sequential_module = current_layers def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **kwargs) -> None: model.sequential_module.foreach_worker( diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py index 5f907c39f344c..62630cf366fd8 100644 --- a/tests/plugins/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -58,7 +58,7 @@ def __init__(self, **kwargs): self.rpc_save_model_count = 0 self.worker_optimizer_step_count = 0 - def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None: + def rpc_save_model(self, save_model_fn, last_filepath, trainer) -> None: self.rpc_save_model_count += 1 def barrier(self, name: Optional[str] = None) -> None: