From 39e852b04cff2df21afd24ab171ca95fff4552c8 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 02:15:28 +0530 Subject: [PATCH 1/4] Add Debug flag to TPU Training Plugins --- pytorch_lightning/accelerators/tpu.py | 4 +++- pytorch_lightning/plugins/training_type/single_tpu.py | 3 ++- pytorch_lightning/plugins/training_type/tpu_spawn.py | 7 ++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 2f8852159b4f8..6bbf88e35d026 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import Any, Callable, Union from torch.optim import Optimizer @@ -51,7 +52,8 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None: return super().setup(trainer, model) def teardown(self) -> None: - pass + if "PT_XLA_DEBUG" in os.environ: + del os.environ["PT_XLA_DEBUG"] def run_optimizer_step( self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index 60cfaef9842fa..a4c0cd4d41a37 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -24,11 +24,12 @@ class SingleTPUPlugin(SingleDevicePlugin): """ Plugin for training on a single TPU device. """ - def __init__(self, device: int): + def __init__(self, device: int, debug: bool = False): device = xm.xla_device(device) super().__init__(device) + self.debug = debug self.tpu_local_core_rank = 0 self.tpu_global_core_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 299c362e7181a..469d556a00cf7 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -49,8 +49,9 @@ class TPUSpawnPlugin(DDPSpawnPlugin): """ Plugin for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method. 
""" - def __init__(self, parallel_devices: Optional[List[int]] = None, **_: Any) -> None: + def __init__(self, parallel_devices: Optional[List[int]] = None, debug: bool = False, **_: Any) -> None: super().__init__(parallel_devices, num_nodes=1, cluster_environment=None, sync_batchnorm=False) + self.debug = debug self.tpu_local_core_rank = 0 self.tpu_global_core_rank = 0 self.start_method = None @@ -104,6 +105,10 @@ def connect(self, model: 'pl.LightningModule') -> None: self.wrapped_model = xmp.MpModelWrapper(LightningDistributedModule(model)) return super().connect(model) + def pre_dispatch(self): + if self.debug: + os.environ["PT_XLA_DEBUG"] = str(1) + def setup(self, model: Module) -> Module: self.create_mp_queue() return self.model From 33c29978706d1bf663ec0f6499f0654b030bd849 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 02:54:15 +0530 Subject: [PATCH 2/4] Add test --- CHANGELOG.md | 3 +++ .../plugins/training_type/single_tpu.py | 5 ++++ tests/models/test_tpu.py | 25 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23e7ced49d910..43d83d8ce6103 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868)) +- Added `debug` flag to TPU Training Plugins ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219)) + + ### Changed diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index a4c0cd4d41a37..fce325f322cc3 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import torch from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin @@ -48,6 +50,9 @@ def pre_dispatch(self) -> None: if isinstance(self.device, int): self.device = xm.xla_device(self.device) + if self.debug: + os.environ["PT_XLA_DEBUG"] = str(1) + self.tpu_local_core_rank = xm.get_local_ordinal() self.tpu_global_core_rank = xm.get_ordinal() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 7154e036bcbf5..7b9b54034140a 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -442,3 +442,28 @@ def test_sync_dist(rank): assert res["test_tensor"].item() == 8, "Result-Log does not work properly with TPU Spawn and Tensors" xmp.spawn(test_sync_dist, nprocs=8, start_method='fork') + + +@RunIf(tpu=True) +@pl_multi_process_test +def test_tpu_debug_mode(tmpdir): + """Test if debug mode works on TPU.""" + + class DebugModel(BoringModel): + + def on_train_start(self): + assert os.environ.get("PT_XLA_DEBUG") == str(1), "PT_XLA_DEBUG was not set in environment variables" + + tutils.reset_seed() + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=4, + tpu_cores=8, + limit_train_batches=0.4, + limit_val_batches=0.4, + plugins=TPUSpawnPlugin(debug=True), + ) + + model = DebugModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From de78c60006f08755b6fd33b689180cff558604e3 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 27 Apr 2021 03:17:51 +0530 Subject: [PATCH 3/4] Add teardown to test --- tests/models/test_tpu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 7b9b54034140a..39be1620909ee 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -454,6 +454,9 @@ class DebugModel(BoringModel): def on_train_start(self): assert os.environ.get("PT_XLA_DEBUG") == str(1), "PT_XLA_DEBUG was not set in environment variables" + def teardown(self, stage): + assert "PT_XLA_DEBUG" not in os.environ + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, From 9086b5e9b4900e0f8e3ea18ac29c3519bc955f87 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 28 Apr 2021 01:39:12 +0530 Subject: [PATCH 4/4] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43d83d8ce6103..cb437e2c74b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,7 +114,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868)) -- Added `debug` flag to TPU Training Plugins ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219)) +- Added `debug` flag to TPU Training Plugins (PT_XLA_DEBUG) ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219))
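
A minimal usage sketch of the new flag, mirroring the `test_tpu_debug_mode` setup above. It assumes a TPU-enabled environment and that `TPUSpawnPlugin` is importable from `pytorch_lightning.plugins` at this release:

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import TPUSpawnPlugin

    # debug=True makes the plugin export PT_XLA_DEBUG=1 in pre_dispatch(),
    # enabling PyTorch/XLA's debug report for the run; the TPU accelerator's
    # teardown() removes the variable again once training finishes.
    trainer = Trainer(tpu_cores=8, plugins=TPUSpawnPlugin(debug=True))
    # trainer.fit(model)  # model: any LightningModule, e.g. BoringModel in the test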