From 39e852b04cff2df21afd24ab171ca95fff4552c8 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 02:15:28 +0530 Subject: [PATCH 1/4] Add Debug flag to TPU Training Plugins --- pytorch_lightning/accelerators/tpu.py | 4 +++- pytorch_lightning/plugins/training_type/single_tpu.py | 3 ++- pytorch_lightning/plugins/training_type/tpu_spawn.py | 7 ++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 2f8852159b4f8..6bbf88e35d026 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import Any, Callable, Union from torch.optim import Optimizer @@ -51,7 +52,8 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None: return super().setup(trainer, model) def teardown(self) -> None: - pass + if "PT_XLA_DEBUG" in os.environ: + del os.environ["PT_XLA_DEBUG"] def run_optimizer_step( self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index 60cfaef9842fa..a4c0cd4d41a37 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -24,11 +24,12 @@ class SingleTPUPlugin(SingleDevicePlugin): """ Plugin for training on a single TPU device. """ - def __init__(self, device: int): + def __init__(self, device: int, debug: bool = False): device = xm.xla_device(device) super().__init__(device) + self.debug = debug self.tpu_local_core_rank = 0 self.tpu_global_core_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 299c362e7181a..469d556a00cf7 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -49,8 +49,9 @@ class TPUSpawnPlugin(DDPSpawnPlugin): """ Plugin for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method. 
""" - def __init__(self, parallel_devices: Optional[List[int]] = None, **_: Any) -> None: + def __init__(self, parallel_devices: Optional[List[int]] = None, debug: bool = False, **_: Any) -> None: super().__init__(parallel_devices, num_nodes=1, cluster_environment=None, sync_batchnorm=False) + self.debug = debug self.tpu_local_core_rank = 0 self.tpu_global_core_rank = 0 self.start_method = None @@ -104,6 +105,10 @@ def connect(self, model: 'pl.LightningModule') -> None: self.wrapped_model = xmp.MpModelWrapper(LightningDistributedModule(model)) return super().connect(model) + def pre_dispatch(self): + if self.debug: + os.environ["PT_XLA_DEBUG"] = str(1) + def setup(self, model: Module) -> Module: self.create_mp_queue() return self.model From 33c29978706d1bf663ec0f6499f0654b030bd849 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 02:54:15 +0530 Subject: [PATCH 2/4] Add test --- CHANGELOG.md | 3 +++ .../plugins/training_type/single_tpu.py | 5 ++++ tests/models/test_tpu.py | 25 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23e7ced49d910..43d83d8ce6103 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868)) +- Added `debug` flag to TPU Training Plugins ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219)) + + ### Changed diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index a4c0cd4d41a37..fce325f322cc3 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import torch from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin @@ -48,6 +50,9 @@ def pre_dispatch(self) -> None: if isinstance(self.device, int): self.device = xm.xla_device(self.device) + if self.debug: + os.environ["PT_XLA_DEBUG"] = str(1) + self.tpu_local_core_rank = xm.get_local_ordinal() self.tpu_global_core_rank = xm.get_ordinal() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 7154e036bcbf5..7b9b54034140a 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -442,3 +442,28 @@ def test_sync_dist(rank): assert res["test_tensor"].item() == 8, "Result-Log does not work properly with TPU Spawn and Tensors" xmp.spawn(test_sync_dist, nprocs=8, start_method='fork') + + +@RunIf(tpu=True) +@pl_multi_process_test +def test_tpu_debug_mode(tmpdir): + """Test if debug mode works on TPU.""" + + class DebugModel(BoringModel): + + def on_train_start(self): + assert os.environ.get("PT_XLA_DEBUG") == str(1), "PT_XLA_DEBUG was not set in environment variables" + + tutils.reset_seed() + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=4, + tpu_cores=8, + limit_train_batches=0.4, + limit_val_batches=0.4, + plugins=TPUSpawnPlugin(debug=True), + ) + + model = DebugModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From de78c60006f08755b6fd33b689180cff558604e3 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 03:17:51 +0530 Subject: [PATCH 3/4] Add teardown to test --- tests/models/test_tpu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 7b9b54034140a..39be1620909ee 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -454,6 +454,9 @@ class DebugModel(BoringModel): def on_train_start(self): assert os.environ.get("PT_XLA_DEBUG") == str(1), "PT_XLA_DEBUG was not set in environment variables" + def teardown(self, stage): + assert "PT_XLA_DEBUG" not in os.environ + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, From 9086b5e9b4900e0f8e3ea18ac29c3519bc955f87 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 28 Apr 2021 01:39:12 +0530 Subject: [PATCH 4/4] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43d83d8ce6103..cb437e2c74b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,7 +114,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868)) -- Added `debug` flag to TPU Training Plugins ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219)) +- Added `debug` flag to TPU Training Plugins (PT_XLA_DEBUG) ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219))
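
A minimal usage sketch of the new flag, mirroring the `test_tpu_debug_mode` setup above. It assumes a TPU-enabled environment and that `TPUSpawnPlugin` is importable from `pytorch_lightning.plugins` at this release:

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import TPUSpawnPlugin

    # debug=True makes the plugin export PT_XLA_DEBUG=1 in pre_dispatch(),
    # enabling PyTorch/XLA's debug report for the run; the TPU accelerator's
    # teardown() removes the variable again once training finishes.
    trainer = Trainer(tpu_cores=8, plugins=TPUSpawnPlugin(debug=True))
    # trainer.fit(model)  # model: any LightningModule, e.g. BoringModel in the test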