Bug description
When using deepspeed_stage_3 or deepspeed_stage_3_offload, if the GPU id passed to the Trainer is not devices=[0] but another id such as devices=[1], the error shown below is raised; with devices=[0] the same script runs as expected.
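A possible workaround until this is fixed (a sketch, not verified against this exact setup): expose only the desired physical GPU through CUDA_VISIBLE_DEVICES so that it becomes cuda:0 inside the process, then select device index 0 in the Trainer.

import os

# Must be set before CUDA is initialized; alternatively launch the script as
#   CUDA_VISIBLE_DEVICES=1 python run.py
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # physical GPU 1 becomes cuda:0

from pytorch_lightning import Trainer

trainer = Trainer(
    accelerator="gpu",
    devices=[0],  # index 0 now refers to physical GPU 1
    precision=16,
    strategy="deepspeed_stage_3",
)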
How to reproduce the bug
import os
import torch
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run():
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        accelerator='gpu',
        devices=[1],
        enable_model_summary=False,
        precision=16,
        strategy='deepspeed_stage_3',
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()
Error messages and logs
Traceback (most recent call last):
File "run.py", line 71, in <module>
run()
File "run.py", line 66, in run
trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
self._call_and_handle_interrupt(
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 648, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
results = self._run_stage()
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
return self._run_train()
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1283, in _run_train
self.fit_loop.run()
File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 271, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance
batch_output = self.batch_loop.run(kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1550, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1705, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 289, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 111, in optimizer_step
closure_result = closure()
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__
self._result = self.closure(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 132, in closure
step_output = self._step_fn()
File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 407, in _training_step
training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1704, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 352, in training_step
return self.model(*args, **kwargs)
File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1318, in forward
loss = self.module(*inputs, **kwargs)
File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
result = forward_call(*input, **kwargs)
File "/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 79, in forward
output = self.module.training_step(*inputs, **kwargs)
File "run.py", line 31, in training_step
loss = self(batch).sum()
File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
result = forward_call(*input, **kwargs)
File "run.py", line 28, in forward
return self.layer(x)
File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
result = forward_call(*input, **kwargs)
File "/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
File "/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 110, in decorate_fwd
return fwd(*args, **kwargs)
File "/home/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/linear.py", line 58, in forward
ret = torch.addmm(bias, input, weight.t())
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm)
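For context, the failure in the last frame can be reproduced outside Lightning and DeepSpeed (a minimal sketch, assuming at least two visible GPUs): a linear layer whose parameters sit on cuda:0, applied to an input on cuda:1, trips the same device check in addmm.

import torch

layer = torch.nn.Linear(32, 2).to("cuda:0")  # weight/bias live on cuda:0
x = torch.randn(2, 32, device="cuda:1")      # input lands on cuda:1
layer(x)  # RuntimeError: Expected all tensors to be on the same device ...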
Environment
* CUDA:
- GPU:
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- NVIDIA A100-SXM-80GB
- available: True
- version: 11.3
* Lightning:
- pytorch-lightning: 1.7.7
- torch: 1.12.1+cu113
- torchmetrics: 0.9.3
* System:
- OS: Linux
- architecture:
- 64bit
- ELF
- processor: x86_64
- python: 3.8.10
- version: #1 SMP Fri May 8 10:59:10 UTC 2020
More info
No response
cc @awaelchli