Labels
fabric, lightning.fabric.Fabric, strategy: ddp, DistributedDataParallel, tests
Bug description
The Fabric AMP integration test test_amp[cuda-16-mixed-expected_dtype2] fails on multi-GPU CI: when fabric.setup() wraps the model in DistributedDataParallel, the cross-rank parameter check raises a RuntimeError because rank 0 reports 2 parameters while rank 1 reports 1.
___________________ test_amp[cuda-16-mixed-expected_dtype2] ____________________
accelerator = 'cuda', precision = '16-mixed', expected_dtype = torch.float16
@pytest.mark.parametrize(
"accelerator, precision, expected_dtype",
[
("cpu", "16-mixed", torch.bfloat16),
("cpu", "bf16-mixed", torch.bfloat16),
pytest.param("cuda", "16-mixed", torch.float16, marks=RunIf(min_cuda_gpus=1)),
pytest.param("cuda", "bf16-mixed", torch.bfloat16, marks=RunIf(min_cuda_gpus=1, bf16_cuda=True)),
],
)
def test_amp(accelerator, precision, expected_dtype):
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision)
fabric.expected_dtype = expected_dtype
> fabric.run()
plugins/precision/test_amp_integration.py:73:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../src/lightning_fabric/fabric.py:665: in _run_impl
return self._strategy.launcher.launch(run_method, *args, **kwargs)
../../src/lightning_fabric/strategies/launchers/subprocess_script.py:90: in launch
return function(*args, **kwargs)
../../src/lightning_fabric/fabric.py:675: in _run_with_setup
return run_function(*args, **kwargs)
helpers/models.py:65: in run
model, optimizer = self.setup(model, optimizer)
../../src/lightning_fabric/fabric.py:197: in setup
module, optimizers = self._strategy.setup_module_and_optimizers( # type: ignore[assignment]
../../src/lightning_fabric/strategies/strategy.py:122: in setup_module_and_optimizers
module = self.setup_module(module)
../../src/lightning_fabric/strategies/ddp.py:117: in setup_module
return DistributedDataParallel(module=module, device_ids=self._determine_ddp_device_ids(), **self._ddp_kwargs)
/usr/local/lib/python3.9/dist-packages/torch/nn/parallel/distributed.py:655: in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
process_group = <torch.distributed.distributed_c10d.ProcessGroupNCCL object at 0x7f42f87e2630>
tensors = [Parameter containing:
tensor([[-0.1364, -0.0437, 0.1193, 0.0296, -0.1345, -0.1418, 0.0879, -0.1315,
-0.02...ice='cuda:0', requires_grad=True), Parameter containing:
tensor([0.1219, 0.0867], device='cuda:0', requires_grad=True)]
logger = None
def _verify_param_shape_across_processes(process_group, tensors, logger=None):
> return dist._verify_params_across_processes(process_group, tensors, logger)
E RuntimeError: DDP expects same model across all ranks, but Rank 0 has 2 params, while rank 1 has inconsistent 1 params.
/usr/local/lib/python3.9/dist-packages/torch/distributed/utils.py:112: RuntimeError
How to reproduce the bug
Set devices > 1 (the failure only shows up with multi-process DDP); a minimal sketch follows below.
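For context, here is a minimal standalone sketch of that configuration. It is a hypothetical script, not the CI test itself, and it assumes two visible CUDA GPUs plus the `lightning_fabric` import path that appears in the traceback:

```python
import torch
import torch.nn as nn

from lightning_fabric import Fabric


def main():
    # Same configuration as the failing parametrization (CUDA + 16-mixed),
    # but with devices=2 so the DDP strategy actually wraps the model.
    fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp", precision="16-mixed")
    fabric.launch()

    model = nn.Linear(32, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # setup() calls the strategy's setup_module(), which constructs
    # DistributedDataParallel; the RuntimeError above is raised from
    # _verify_param_shape_across_processes inside that constructor.
    model, optimizer = fabric.setup(model, optimizer)

    batch = torch.rand(4, 32, device=fabric.device)
    loss = model(batch).sum()
    fabric.backward(loss)
    optimizer.step()


if __name__ == "__main__":
    main()
```

With devices=1 the DDP wrap (and therefore the cross-rank shape check) never runs, which is consistent with the failure only appearing once devices > 1.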
Environment
Reproduces on our CI