-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Closed
Description
Hi - I'm getting a new error while trying to train a model on an 8 x V100 box. I'm using PyTorch Lightning, but I don't think that should make much of a difference.
Sys config:
PyTorch 1.8
CUDA 10.2
Ubuntu 18.04
DeepSpeed 0.3.14
Triton 0.2.3
Apex master branch
PyTorch Lightning 1.3.0rc1
Error trace:
Epoch 0: 0%| | 0/564 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 488, in fit
self.dispatch()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 531, in dispatch
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 95, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 142, in start_training
self._results = trainer.run_stage()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in run_stage
self.run_train()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 607, in run_train
self.train_loop.run_training_epoch()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 422, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 575, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 370, in optimizer_step
using_lbfgs=is_lbfgs,
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1414, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 301, in optimizer_step
self.lightning_module, optimizer, opt_idx, lambda_closure, **kwargs
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/deepspeed_precision.py", line 47, in pre_optimizer_step
lambda_closure()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 570, in train_step_and_backward_closure
split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 673, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 709, in backward
result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 284, in backward
self.lightning_module, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/deepspeed_precision.py", line 73, in backward
deepspeed_engine.backward(closure_loss, *args, **kwargs)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/engine.py", line 1020, in backward
self.allreduce_gradients()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/engine.py", line 940, in allreduce_gradients
self.optimizer.overlapping_partition_gradients_reduce_epilogue()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py", line 1393, in overlapping_partition_gradients_reduce_epilogue
self.independent_gradient_partition_epilogue()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py", line 1295, in independent_gradient_partition_epilogue
self.partition_previous_reduced_grads()
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/stage3.py", line 1657, in partition_previous_reduced_grads
param.partition_gradients(partition_buffers=self.temp_grad_gpu_buffer)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 460, in partition_gradients
accumulate=accumulate)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 794, in _partition_gradients
accumulate=accumulate)
File "/home/ubuntu/anaconda3/envs/torch/lib/python3.7/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 862, in _partition_gradient
param.grad.data = dest_tensor_full_buffer.data
UnboundLocalError: local variable 'dest_tensor_full_buffer' referenced before assignment
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels