
Tensors not on the same device when specifying GPU id and using deepspeed_stage_3(_offload) #15066

Closed
@w3nhao

Bug description

When using deepspeed_stage_3 or deepspeed_stage_3_offload, the error below is raised if the GPU id is not devices=[0] but some other id such as devices=[1].
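
A possible workaround (my assumption, not verified in this report): restrict CUDA_VISIBLE_DEVICES so that the target GPU is the only one the process can see; it then appears as cuda:0 and the mismatch does not occur. A minimal sketch:

import os

# Hide every GPU except physical GPU 1; it will show up as cuda:0 inside this process.
# This must be set before torch initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from pytorch_lightning import Trainer

trainer = Trainer(
    accelerator="gpu",
    devices=[0],  # physical GPU 1, now visible as device index 0
    precision=16,
    strategy="deepspeed_stage_3",
)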

How to reproduce the bug

import os

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run():
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        accelerator='gpu',
        devices=[1],
        enable_model_summary=False,
        precision=16,
        strategy='deepspeed_stage_3',
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()

Error messages and logs


Traceback (most recent call last):
  File "run.py", line 71, in <module>
    run()
  File "run.py", line 66, in run
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
    self._call_and_handle_interrupt(
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 648, in _call_and_handle_interrupt
    return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
    return function(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
    results = self._run_stage()
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
    return self._run_train()
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1283, in _run_train
    self.fit_loop.run()
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 271, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1550, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1705, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 289, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step
    return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 111, in optimizer_step
    closure_result = closure()
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 132, in closure
    step_output = self._step_fn()
  File "/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 407, in _training_step
    training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
  File "/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1704, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 352, in training_step
    return self.model(*args, **kwargs)
  File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1318, in forward
    loss = self.module(*inputs, **kwargs)
  File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 79, in forward
    output = self.module.training_step(*inputs, **kwargs)
  File "run.py", line 31, in training_step
    loss = self(batch).sum()
  File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "run.py", line 28, in forward
    return self.layer(x)
  File "/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
  File "/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 110, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/linear.py", line 58, in forward
    ret = torch.addmm(bias, input, weight.t())
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm)
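
Not part of the original log, but a small debugging sketch: replacing training_step in the BoringModel above with the version below prints where the batch and the layer parameters end up, which confirms that the two sides of the addmm land on different devices when devices=[1] is used:

    def training_step(self, batch, batch_idx):
        # Debugging aid only (assumption, not in the original repro):
        # print the device of the input batch and of the linear layer's parameters.
        print("batch device:", batch.device)
        print("param devices:", {p.device for p in self.layer.parameters()})
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}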


Environment

* CUDA:
        - GPU:
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
                - NVIDIA A100-SXM-80GB
        - available:         True
        - version:           11.3
* Lightning:
        - pytorch-lightning: 1.7.7
        - torch:             1.12.1+cu113
        - torchmetrics:      0.9.3
* System:
        - OS:                Linux
        - architecture:
                - 64bit
                - ELF
        - processor:         x86_64
        - python:            3.8.10
        - version:           #1 SMP Fri May 8 10:59:10 UTC 2020

More info

No response

cc @awaelchli
