Add a fix for single-GPU nsys.

The SingleDeviceStrategy never sets the requisite `current_epoch_step` attribute; work around it in the profiling code. Signed-off-by: Tom Fogal <60981+tfogal@users.noreply.github.com>
NVIDIA · Nov 21, 2024 · ecc8a1f · ecc8a1f
1 parent 05b7d4f
commit ecc8a1f
Showing 1 changed file with 10 additions and 4 deletions.
diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py
@@ -74,10 +74,14 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt
         """
 
         device = trainer.strategy.root_device
-        current_step = trainer.strategy.current_epoch_step
+        try:
+            # Not all strategies have this. e.g.:
+            #    AttributeError: 'SingleDeviceStrategy' object has no attribute 'current_epoch_step'
+            current_step = trainer.strategy.current_epoch_step
+        except AttributeError:
+            current_step = self._nsys_profile_start_step
         if device.type == 'cuda':
             if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks:
-                logging.info("====== Start nsys profiling ======")
                 torch.cuda.cudart().cudaProfilerStart()
                 if self._nsys_profile_gen_shape:
                     torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
@@ -91,9 +95,11 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int)
         """
 
         device = trainer.strategy.root_device
-        current_step = trainer.strategy.current_epoch_step
+        try:
+            current_step = trainer.strategy.current_epoch_step
+        except AttributeError:
+            current_step = self._nsys_profile_end_step
         if device.type == 'cuda':
             if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks:
-                logging.info("====== End nsys profiling ======")
                 torch.cuda.cudart().cudaProfilerStop()
                 torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)