Skip to content

Commit

Permalink
Add a fix for single-GPU nsys.
Browse files Browse the repository at this point in the history
The SingleDeviceStrategy never sets the requisite attribute
(`current_epoch_step`); work around it in the profiling code.

Signed-off-by: Tom Fogal <60981+tfogal@users.noreply.github.com>
  • Loading branch information
tfogal committed Nov 21, 2024
1 parent 05b7d4f commit ecc8a1f
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions nemo/lightning/pytorch/callbacks/nsys.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,14 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt
"""

device = trainer.strategy.root_device
current_step = trainer.strategy.current_epoch_step
try:
# Not all strategies have this. e.g.:
# AttributeError: 'SingleDeviceStrategy' object has no attribute 'current_epoch_step'
current_step = trainer.strategy.current_epoch_step
except AttributeError:
current_step = self._nsys_profile_start_step
if device.type == 'cuda':
if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks:
logging.info("====== Start nsys profiling ======")
torch.cuda.cudart().cudaProfilerStart()
if self._nsys_profile_gen_shape:
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
Expand All @@ -91,9 +95,11 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int)
"""

device = trainer.strategy.root_device
current_step = trainer.strategy.current_epoch_step
try:
current_step = trainer.strategy.current_epoch_step
except AttributeError:
current_step = self._nsys_profile_end_step
if device.type == 'cuda':
if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks:
logging.info("====== End nsys profiling ======")
torch.cuda.cudart().cudaProfilerStop()
torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)

0 comments on commit ecc8a1f

Please sign in to comment.