From bc6210d7c48a55c843ee5fc228078d94205120d6 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sat, 24 Apr 2021 12:22:09 -0700
Subject: [PATCH 1/6] Update data_connector.py

---
 pytorch_lightning/trainer/connectors/data_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py
index 5d2f141dc64a8..5068ac34b68ce 100644
--- a/pytorch_lightning/trainer/connectors/data_connector.py
+++ b/pytorch_lightning/trainer/connectors/data_connector.py
@@ -67,6 +67,7 @@ def prepare_data(self, model):
                 self.trainer.datamodule.prepare_data()
             model.prepare_data()
             self.trainer._is_data_prepared = True
+            self.trainer.accelerator.barrier("prepare_data")
 
     def can_prepare_data(self):
         should_call_dm_prepare_data = True

From 31ff5482e751016c859facf98e1c43182074ffd5 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sun, 25 Apr 2021 12:00:12 -0700
Subject: [PATCH 2/6] move-barrier

---
 pytorch_lightning/trainer/connectors/data_connector.py | 1 -
 pytorch_lightning/trainer/trainer.py                    | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py
index 5068ac34b68ce..5d2f141dc64a8 100644
--- a/pytorch_lightning/trainer/connectors/data_connector.py
+++ b/pytorch_lightning/trainer/connectors/data_connector.py
@@ -67,7 +67,6 @@ def prepare_data(self, model):
                 self.trainer.datamodule.prepare_data()
             model.prepare_data()
             self.trainer._is_data_prepared = True
-            self.trainer.accelerator.barrier("prepare_data")
 
     def can_prepare_data(self):
         should_call_dm_prepare_data = True
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 94853faaec025..b4dbd35e5cf95 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -456,7 +456,9 @@ def fit(
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.connect(model)
         self.accelerator.setup_environment()
+        self.accelerator.barrier("pre-setup")
         self.call_setup_hook(model)  # allow user to setup lightning_module in accelerator environment
+        self.accelerator.barrier("post-setup")
         self.call_configure_sharded_model(model)  # allow user to setup in model sharded environment
         self.accelerator.setup(self, model)  # note: this sets up self.lightning_module
 

From d7d5757252ccdbd27f7b539b6155f5c54d79b260 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Mon, 26 Apr 2021 01:13:51 -0700
Subject: [PATCH 3/6] Update trainer.py

---
 pytorch_lightning/trainer/trainer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b4dbd35e5cf95..0b274cd350854 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -456,9 +456,7 @@ def fit(
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.connect(model)
         self.accelerator.setup_environment()
-        self.accelerator.barrier("pre-setup")
         self.call_setup_hook(model)  # allow user to setup lightning_module in accelerator environment
-        self.accelerator.barrier("post-setup")
         self.call_configure_sharded_model(model)  # allow user to setup in model sharded environment
         self.accelerator.setup(self, model)  # note: this sets up self.lightning_module
 
@@ -1121,6 +1119,7 @@ def call_setup_hook(self, model: LightningModule) -> None:
 
         self.setup(model, stage=state)
         model.setup(stage=state)
+        self.accelerator.barrier("setup")
self.accelerator.barrier("setup") def call_configure_sharded_model(self, model: LightningModule) -> None: # Call configure sharded model hook if accelerator requests. In some cases From c6e4d62dbf21f337f00a36c174b55d69ebc2c5b1 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 26 Apr 2021 09:47:53 -0700 Subject: [PATCH 4/6] Update ddp.py --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 186eb582b3264..6ef9793191101 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -285,7 +285,7 @@ def post_dispatch(self) -> None: self.cluster_environment.teardown() def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): + if torch_distrib.is_available() and torch_distrib.is_initialized(): torch_distrib.barrier() def broadcast(self, obj: object, src: int = 0) -> object: From 092e767afc6b7f1bccc21a3f6c962aab0a725232 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 26 Apr 2021 22:18:35 -0700 Subject: [PATCH 5/6] changelog --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/trainer.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92968eeb94634..d8b297926c3ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202)) + + - Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942)) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0b274cd350854..4f8ef48c1b1ad 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1111,6 +1111,7 @@ def tune( def call_setup_hook(self, model: LightningModule) -> None: assert self.state.running, f"TrainerState: {self.state}" state = self._setup_state + self.accelerator.barrier("pre_setup") if self.datamodule is not None: called = getattr(self.datamodule, f'has_setup_{state}') @@ -1119,7 +1120,7 @@ def call_setup_hook(self, model: LightningModule) -> None: self.setup(model, stage=state) model.setup(stage=state) - self.accelerator.barrier("setup") + self.accelerator.barrier("post_setup") def call_configure_sharded_model(self, model: LightningModule) -> None: # Call configure sharded model hook if accelerator requests. 

From da9bd6f920dea4884482729bd6eb8319fca1a9bb Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 27 Apr 2021 16:59:03 +0200
Subject: [PATCH 6/6] Spacing

---
 pytorch_lightning/trainer/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 4f8ef48c1b1ad..764eb16c95bdb 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1111,6 +1111,7 @@ def tune(
     def call_setup_hook(self, model: LightningModule) -> None:
         assert self.state.running, f"TrainerState: {self.state}"
         state = self._setup_state
+
         self.accelerator.barrier("pre_setup")
 
         if self.datamodule is not None:
@@ -1120,6 +1121,7 @@ def call_setup_hook(self, model: LightningModule) -> None:
 
         self.setup(model, stage=state)
         model.setup(stage=state)
+
         self.accelerator.barrier("post_setup")
 
     def call_configure_sharded_model(self, model: LightningModule) -> None:
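
Note on the pattern above: the `ddp.py` change makes `barrier()` a no-op unless `torch.distributed` is both available and initialized, which is what keeps the new `pre_setup`/`post_setup` synchronization points safe on single-process runs. A minimal standalone sketch of that guard, using a hypothetical `safe_barrier` helper name that is not part of the patch:

import torch.distributed as torch_distrib


def safe_barrier(name: str = "") -> None:
    # Block until every rank reaches this point, but only when a distributed
    # process group is actually available and initialized; otherwise do nothing,
    # mirroring the guarded barrier in the DDP plugin. The `name` tag is purely
    # informational, like the labels passed to `accelerator.barrier` above.
    if torch_distrib.is_available() and torch_distrib.is_initialized():
        torch_distrib.barrier()


# Usage mirroring the patch: fence the setup hooks so no rank races ahead
# before every process has finished running `setup`.
safe_barrier("pre_setup")
# ... run setup()/datamodule.setup() on every rank ...
safe_barrier("post_setup")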