diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
index 442abcfeec020..b3fbf0d17fd4c 100644
--- a/src/lightning/fabric/CHANGELOG.md
+++ b/src/lightning/fabric/CHANGELOG.md
@@ -117,6 +117,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fabric argument validation now only raises an error if conflicting settings are set through the CLI ([#17679](https://github.com/Lightning-AI/lightning/pull/17679))
 
 
+- Disabled the auto-detection of the Kubeflow environment ([#18137](https://github.com/Lightning-AI/lightning/pull/18137))
+
+
 ### Deprecated
 
 - Deprecated the `DDPStrategy.is_distributed` property. This strategy is distributed by definition ([#17381](https://github.com/Lightning-AI/lightning/pull/17381))
diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py
index 0e57d5f2f1df8..8a269cf1a8b61 100644
--- a/src/lightning/fabric/connector.py
+++ b/src/lightning/fabric/connector.py
@@ -34,7 +34,6 @@
 )
 from lightning.fabric.plugins.environments import (
     ClusterEnvironment,
-    KubeflowEnvironment,
     LightningEnvironment,
     LSFEnvironment,
     MPIEnvironment,
@@ -361,7 +360,6 @@ def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
         for env_type in (
             SLURMEnvironment,
             TorchElasticEnvironment,
-            KubeflowEnvironment,
             LSFEnvironment,
             MPIEnvironment,
         ):
diff --git a/src/lightning/fabric/plugins/environments/kubeflow.py b/src/lightning/fabric/plugins/environments/kubeflow.py
index 7420b4d861d47..967c4682f2184 100644
--- a/src/lightning/fabric/plugins/environments/kubeflow.py
+++ b/src/lightning/fabric/plugins/environments/kubeflow.py
@@ -21,7 +21,10 @@
 
 
 class KubeflowEnvironment(ClusterEnvironment):
-    """Environment for distributed training using the `PyTorchJob`_ operator from `Kubeflow`_
+    """Environment for distributed training using the `PyTorchJob`_ operator from `Kubeflow`_.
+
+    This environment, unlike others, does not get auto-detected and needs to be passed to the Fabric/Trainer
+    constructor manually.
 
     .. _PyTorchJob: https://www.kubeflow.org/docs/components/training/pytorch/
     .. _Kubeflow: https://www.kubeflow.org
@@ -41,12 +44,7 @@ def main_port(self) -> int:
 
     @staticmethod
     def detect() -> bool:
-        """Returns ``True`` if the current process was launched using Kubeflow PyTorchJob."""
-        required_env_vars = {"KUBERNETES_PORT", "MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"}
-        # torchelastic sets these. Make sure we're not in torchelastic
-        excluded_env_vars = {"GROUP_RANK", "LOCAL_RANK", "LOCAL_WORLD_SIZE"}
-        env_vars = os.environ.keys()
-        return required_env_vars.issubset(env_vars) and excluded_env_vars.isdisjoint(env_vars)
+        raise NotImplementedError("The Kubeflow environment can't be detected automatically.")
 
     def world_size(self) -> int:
         return int(os.environ["WORLD_SIZE"])
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index de98f79bbf12b..b5e1daf9b2135 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -123,6 +123,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - During `LightningModule.setup()`, the `self.device` now returns the device the module will be placed on instead of `cpu` ([#18021](https://github.com/Lightning-AI/lightning/pull/18021))
 
+- Disabled the auto-detection of the Kubeflow environment ([#18137](https://github.com/Lightning-AI/lightning/pull/18137))
+
+
 - Increased the minimum supported `wandb` version for `WandbLogger` from 0.12.0 to 0.12.10 ([#18171](https://github.com/Lightning-AI/lightning/pull/18171))
 
diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
index 7e21f6216c1e3..508a6adae1adb 100644
--- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
+++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
@@ -22,7 +22,6 @@
 from lightning.fabric.connector import _convert_precision_to_unified_args, _PRECISION_INPUT, _PRECISION_INPUT_STR
 from lightning.fabric.plugins.environments import (
     ClusterEnvironment,
-    KubeflowEnvironment,
     LightningEnvironment,
     LSFEnvironment,
     MPIEnvironment,
@@ -397,7 +396,6 @@ def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
         for env_type in (
             SLURMEnvironment,
             TorchElasticEnvironment,
-            KubeflowEnvironment,
             LSFEnvironment,
             MPIEnvironment,
         ):
diff --git a/tests/tests_fabric/plugins/environments/test_kubeflow.py b/tests/tests_fabric/plugins/environments/test_kubeflow.py
index 11717bf09ab05..3436adc9ce2aa 100644
--- a/tests/tests_fabric/plugins/environments/test_kubeflow.py
+++ b/tests/tests_fabric/plugins/environments/test_kubeflow.py
@@ -74,30 +74,7 @@ def test_attributes_from_environment_variables(caplog):
     assert "setting world size is not allowed" in caplog.text
 
 
-@mock.patch.dict(
-    os.environ,
-    {
-        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
-        "MASTER_ADDR": "1.2.3.4",
-        "MASTER_PORT": "500",
-        "WORLD_SIZE": "20",
-        "RANK": "1",
-    },
-)
 def test_detect_kubeflow():
-    assert KubeflowEnvironment.detect()
-
-
-@mock.patch.dict(
-    os.environ,
-    {
-        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
-        "MASTER_ADDR": "1.2.3.4",
-        "MASTER_PORT": "500",
-        "WORLD_SIZE": "20",
-        "RANK": "1",
-        "GROUP_RANK": "1",
-    },
-)
-def test_detect_torchelastic_over_kubeflow():
-    assert not KubeflowEnvironment.detect()
+    """Test that the KubeflowEnvironment does not support auto-detection."""
+    with pytest.raises(NotImplementedError, match="can't be detected automatically"):
+        KubeflowEnvironment.detect()
diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py
index 8fa157342e6bc..67eb886166f18 100644
--- a/tests/tests_fabric/test_connector.py
+++ b/tests/tests_fabric/test_connector.py
@@ -602,7 +602,7 @@ def test_strategy_choice_ddp_torchelastic(*_):
 @mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
 @mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
 def test_strategy_choice_ddp_kubeflow(*_):
-    connector = _Connector(accelerator="gpu", devices=2)
+    connector = _Connector(accelerator="gpu", devices=2, plugins=KubeflowEnvironment())
     assert isinstance(connector.accelerator, CUDAAccelerator)
     assert isinstance(connector.strategy, DDPStrategy)
     assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
@@ -621,7 +621,7 @@ def test_strategy_choice_ddp_kubeflow(*_):
     },
 )
 def test_strategy_choice_ddp_cpu_kubeflow():
-    connector = _Connector(accelerator="cpu", devices=2)
+    connector = _Connector(accelerator="cpu", devices=2, plugins=KubeflowEnvironment())
     assert isinstance(connector.accelerator, CPUAccelerator)
     assert isinstance(connector.strategy, DDPStrategy)
     assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
@@ -793,11 +793,11 @@ def test_ddp_fork_on_unsupported_platform(_, __, strategy):
         pytest.param("bf16-true", "fsdp", FSDPPrecision, marks=RunIf(min_torch="1.12", min_cuda_gpus=1)),
         pytest.param("16-mixed", "fsdp", FSDPPrecision, marks=RunIf(min_torch="1.12", min_cuda_gpus=1)),
         pytest.param("bf16-mixed", "fsdp", FSDPPrecision, marks=RunIf(min_torch="1.12", min_cuda_gpus=1)),
-        pytest.param("32-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True)),
-        pytest.param("16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True)),
-        pytest.param("bf16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True)),
-        pytest.param("16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True)),
-        pytest.param("bf16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True)),
+        pytest.param("32-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
+        pytest.param("16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
+        pytest.param("bf16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
+        pytest.param("16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
+        pytest.param("bf16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
     ],
 )
 def test_precision_selection(precision_str, strategy_str, expected_precision_cls):
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
index 60aa07994c9a4..35e2fce9d6602 100644
--- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -247,7 +247,7 @@ def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strat
     assert isinstance(trainer.strategy, strategy_class)
 
 
-def test_accelerator_cpu(cuda_count_0):
+def test_accelerator_cpu(cuda_count_0, mps_count_0):
     trainer = Trainer(accelerator="cpu")
 
     assert isinstance(trainer.accelerator, CPUAccelerator)
@@ -482,7 +482,7 @@ def test_strategy_choice_ddp_torchelastic(_, __, mps_count_0, cuda_count_2):
 @mock.patch("torch.cuda.set_device")
 @mock.patch("lightning.pytorch.strategies.DDPStrategy.setup_distributed", autospec=True)
 def test_strategy_choice_ddp_kubeflow(_, __, mps_count_0, cuda_count_2):
-    trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=2)
+    trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=2, plugins=KubeflowEnvironment())
     assert isinstance(trainer.accelerator, CUDAAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
     assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment)
@@ -502,7 +502,7 @@ def test_strategy_choice_ddp_kubeflow(_, __, mps_count_0, cuda_count_2):
     },
 )
 @mock.patch("lightning.pytorch.strategies.DDPStrategy.setup_distributed", autospec=True)
 def test_strategy_choice_ddp_cpu_kubeflow(cuda_count_0):
-    trainer = Trainer(fast_dev_run=True, accelerator="cpu", devices=2)
+    trainer = Trainer(fast_dev_run=True, accelerator="cpu", devices=2, plugins=KubeflowEnvironment())
     assert isinstance(trainer.accelerator, CPUAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
     assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment)
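
Usage sketch (illustrative, not part of the patch): with auto-detection removed, code launched under a Kubeflow PyTorchJob now has to pass the environment explicitly, as the updated docstring and tests above do. A minimal sketch, assuming the process runs in a pod where the operator has already set MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK; the accelerator, device count and "ddp" strategy are illustrative choices.

    from lightning.fabric import Fabric
    from lightning.fabric.plugins.environments import KubeflowEnvironment
    from lightning.pytorch import Trainer

    # KubeflowEnvironment is no longer picked up automatically; pass it as a plugin.
    env = KubeflowEnvironment()

    # With Fabric:
    fabric = Fabric(accelerator="gpu", devices=2, strategy="ddp", plugins=env)
    fabric.launch()

    # Or with the PyTorch Lightning Trainer:
    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", plugins=KubeflowEnvironment())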