
[core][refactor] Move accelerator-specific environment variables out of ray_constants.py to avoid redefining them #51026

Merged · 6 commits · Mar 11, 2025
9 changes: 0 additions & 9 deletions python/ray/_private/ray_constants.py
@@ -422,15 +422,6 @@ def env_set_by_user(key):

 LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]

-# Accelerator constants
-NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
-
-CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
-ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES"
-NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
-TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"
-NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
-
Member Author: This is the same as ASCEND_RT_VISIBLE_DEVICES_ENV_VAR.


 NEURON_CORES = "neuron_cores"
 GPU = "GPU"
 TPU = "TPU"
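For orientation, the constants deleted above now live in the per-accelerator modules, and call sites import them from there. A minimal sketch of the new import style, using only the module paths that appear in the imports added by this PR (the TPU constant is not imported anywhere in the visible diff, so its new home is not shown here):

```python
# The definitions removed from ray_constants.py now come from the
# accelerator modules (paths per the imports added in this PR).
from ray._private.accelerators.nvidia_gpu import (
    CUDA_VISIBLE_DEVICES_ENV_VAR,  # "CUDA_VISIBLE_DEVICES"
    NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR,  # "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
)
from ray._private.accelerators.amd_gpu import ROCR_VISIBLE_DEVICES_ENV_VAR
from ray._private.accelerators.neuron import NEURON_RT_VISIBLE_CORES_ENV_VAR
from ray._private.accelerators.npu import ASCEND_RT_VISIBLE_DEVICES_ENV_VAR
```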
5 changes: 2 additions & 3 deletions python/ray/air/_internal/device_manager/npu.py
@@ -7,6 +7,7 @@
 import ray
 import ray._private.ray_constants as ray_constants
 from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
+from ray._private.accelerators.npu import ASCEND_RT_VISIBLE_DEVICES_ENV_VAR


 def is_package_present(package_name: str) -> bool:
@@ -54,9 +55,7 @@ def get_devices(self) -> List[torch.device]:
         device_ids = []

         if len(npu_ids) > 0:
-            npu_visible_str = os.environ.get(
-                ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, ""
-            )
+            npu_visible_str = os.environ.get(ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, "")
             if npu_visible_str and npu_visible_str != "NoDevFiles":
                 npu_visible_list = npu_visible_str.split(",")
             else:
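As context for the hunk above, the "NoDevFiles" guard exists because the code treats that sentinel value of ASCEND_RT_VISIBLE_DEVICES the same as "no devices". A toy illustration of the parsing behavior described by the code (input values are assumptions for the example):

```python
# Mirror the guard from get_devices(): empty string and the "NoDevFiles"
# sentinel both mean "no visible NPUs"; otherwise split the id list.
for value in ("0,1,2", "", "NoDevFiles"):
    visible = value.split(",") if value and value != "NoDevFiles" else []
    print(repr(value), "->", visible)
```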
3 changes: 2 additions & 1 deletion python/ray/dashboard/modules/job/job_manager.py
@@ -13,6 +13,7 @@
 from ray._private.event.event_logger import get_event_logger
 from ray._private.gcs_utils import GcsAioClient
 from ray._private.utils import run_background_task
+from ray._private.accelerators.nvidia_gpu import NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR
 from ray.actor import ActorHandle
 from ray.core.generated.event_pb2 import Event
 from ray.dashboard.consts import (
@@ -359,7 +360,7 @@ def _get_supervisor_runtime_env(
             # Don't set CUDA_VISIBLE_DEVICES for the supervisor actor so the
             # driver can use GPUs if it wants to. This will be removed from
             # the driver's runtime_env so it isn't inherited by tasks & actors.
-            env_vars[ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR] = "1"
+            env_vars[NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR] = "1"
             runtime_env["env_vars"] = env_vars

         if os.getenv(RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR, "0") == "1":
3 changes: 2 additions & 1 deletion python/ray/dashboard/modules/job/job_supervisor.py
@@ -16,6 +16,7 @@
 from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter
 from ray._private.runtime_env.constants import RAY_JOB_CONFIG_JSON_ENV_VAR
 from ray._private.utils import remove_ray_internal_flags_from_env
+from ray._private.accelerators.nvidia_gpu import NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR
 from ray.actor import ActorHandle
 from ray.dashboard.modules.job.common import (
     JOB_ID_METADATA_KEY,
@@ -139,7 +140,7 @@ def _get_driver_runtime_env(
         # Allow CUDA_VISIBLE_DEVICES to be set normally for the driver's tasks
         # & actors.
         env_vars = curr_runtime_env.get("env_vars", {})
-        env_vars.pop(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR)
+        env_vars.pop(NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR)
         env_vars.pop(ray_constants.RAY_WORKER_NICENESS)
         curr_runtime_env["env_vars"] = env_vars
         return curr_runtime_env
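Taken together, the job_manager.py and job_supervisor.py hunks implement a set-then-unset handshake around the same flag. A hedged sketch of the round trip (not the actual job-manager code; the dict names are illustrative):

```python
from ray._private.accelerators.nvidia_gpu import NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR

# Supervisor side: opt the driver out of Ray's CUDA_VISIBLE_DEVICES
# management so the driver process can use any GPU it wants.
supervisor_runtime_env = {"env_vars": {NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR: "1"}}

# Driver side: strip the flag before launching tasks/actors so they
# inherit the default, restricted CUDA_VISIBLE_DEVICES behavior again.
driver_env_vars = dict(supervisor_runtime_env["env_vars"])
driver_env_vars.pop(NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR)
```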
@@ -2,7 +2,7 @@

 import ray

-cuda_env = ray._private.ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR
+cuda_env = ray._private.accelerators.nvidia_gpu.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR
 if os.environ.get("RAY_TEST_RESOURCES_SPECIFIED") == "1":
     assert cuda_env not in os.environ
     assert "CUDA_VISIBLE_DEVICES" in os.environ
14 changes: 8 additions & 6 deletions python/ray/train/_internal/backend_executor.py
@@ -7,6 +7,10 @@

 import ray
 import ray._private.ray_constants as ray_constants
+from ray._private.accelerators.neuron import NEURON_RT_VISIBLE_CORES_ENV_VAR
+from ray._private.accelerators.npu import ASCEND_RT_VISIBLE_DEVICES_ENV_VAR
+from ray._private.accelerators.amd_gpu import ROCR_VISIBLE_DEVICES_ENV_VAR
+from ray._private.accelerators.nvidia_gpu import CUDA_VISIBLE_DEVICES_ENV_VAR
 from ray._private.ray_constants import env_integer
 from ray.data import Dataset
 from ray.exceptions import RayActorError
@@ -118,18 +122,18 @@ def __init__(
             ResourceConfig(
                 ray_constants.NEURON_CORES,
                 ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
-                ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR,
+                NEURON_RT_VISIBLE_CORES_ENV_VAR,
             ),
             ResourceConfig(
                 ray_constants.NPU,
                 ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
-                ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR,
+                ASCEND_RT_VISIBLE_DEVICES_ENV_VAR,
             ),
             # For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var.
             ResourceConfig(
                 ray_constants.GPU,
                 ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV,
-                ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR,
+                ROCR_VISIBLE_DEVICES_ENV_VAR,
             ),
         ]
@@ -299,9 +303,7 @@ def _share_cuda_visible_devices(self):
         - Worker3: "0,1"

         """
-        self._share_resource_ids(
-            ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR
-        )
+        self._share_resource_ids(ray_constants.GPU, CUDA_VISIBLE_DEVICES_ENV_VAR)

     def _share_resource_ids(self, resource: str, env_var: str):
         """Sets the given env_var on all workers.
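For readers unfamiliar with _share_resource_ids, the gist is that every worker ends up seeing the union of the device ids held by group workers on its node, which is what produces the Worker3: "0,1" example in the docstring above. A rough sketch of that semantics (WorkerInfo is a hypothetical stand-in, not Ray's worker metadata type):

```python
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class WorkerInfo:
    # Hypothetical stand-in for Ray Train's per-worker metadata.
    node_id: str
    device_ids: List[str]


def share_visible_device_ids(workers: List[WorkerInfo]) -> Dict[int, str]:
    """Compute {worker index: env var value}: each worker gets the union
    of device ids used by all group workers on its node."""
    per_node: Dict[str, set] = defaultdict(set)
    for worker in workers:
        per_node[worker.node_id].update(worker.device_ids)
    return {
        i: ",".join(sorted(per_node[worker.node_id]))
        for i, worker in enumerate(workers)
    }


# Two workers sharing node "A" each see both devices; the lone worker on
# node "B" sees only its own.
workers = [WorkerInfo("A", ["0"]), WorkerInfo("A", ["1"]), WorkerInfo("B", ["0"])]
assert share_visible_device_ids(workers) == {0: "0,1", 1: "0,1", 2: "0"}
```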
6 changes: 3 additions & 3 deletions python/ray/train/tests/test_backend.py
@@ -8,7 +8,7 @@
 import pytest

 import ray
-import ray._private.ray_constants as ray_constants
+from ray._private.accelerators.neuron import NEURON_RT_VISIBLE_CORES_ENV_VAR
 from ray import train
 from ray.air._internal.util import StartTraceback

@@ -493,7 +493,7 @@ def test_neuron_core_accelerator_ids(ray_2_node_2_neuron_cores, worker_results):
     config = TestConfig()

     def get_resources():
-        neuron_resource_ids = os.environ[ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR]
+        neuron_resource_ids = os.environ[NEURON_RT_VISIBLE_CORES_ENV_VAR]
         # Sort the runtime ids to have exact match with expected result.
         sorted_devices = [
             int(device) for device in sorted(neuron_resource_ids.split(","))
@@ -530,7 +530,7 @@ def test_neuron_core_accelerator_ids_sharing_disabled(
     config = TestConfig()

     def get_resources():
-        neuron_resource_ids = os.environ[ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR]
+        neuron_resource_ids = os.environ[NEURON_RT_VISIBLE_CORES_ENV_VAR]
         # Sort the runtime ids to have exact match with expected result.
         sorted_devices = [
             int(device) for device in sorted(neuron_resource_ids.split(","))
3 changes: 2 additions & 1 deletion python/ray/train/v2/_internal/callbacks/accelerators.py
@@ -5,6 +5,7 @@

 import ray._private.ray_constants as ray_constants
 from ray._private.ray_constants import env_bool
+from ray._private.accelerators.nvidia_gpu import CUDA_VISIBLE_DEVICES_ENV_VAR
 from ray.train import BackendConfig
 from ray.train.constants import ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV
 from ray.train.v2._internal.execution.callback import WorkerGroupCallback
@@ -63,7 +64,7 @@ def _share_cuda_visible_devices(worker_group: WorkerGroup):
     - Worker2: "0,1"
     """
     _share_accelerator_ids(
-        worker_group, ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR
+        worker_group, ray_constants.GPU, CUDA_VISIBLE_DEVICES_ENV_VAR
Collaborator: @justinvyu PTAL for codeowner approval. Also, can you help me understand why we depend on this env var directly?

Contributor: We need to overwrite Ray Core's default behavior of restricting CUDA_VISIBLE_DEVICES to ray_gpu_ids (the list of devices assigned to the actor) and instead set the env var to the set of ALL devices on a node that are used by training workers in the group. NCCL needs this in order to do cross-device communication during training.

     )


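To make the reviewer's point concrete, here is a hypothetical before/after for one node with two training workers, each assigned a single GPU by Ray Core (device ids are illustrative, not taken from the PR):

```python
import os

# Default Ray Core behavior (each actor restricted to its own devices):
#   worker 0: CUDA_VISIBLE_DEVICES="0"
#   worker 1: CUDA_VISIBLE_DEVICES="1"
# NCCL cannot open peer-to-peer channels: each process sees only one device.
#
# With sharing enabled (via ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV), the
# callback overwrites the variable on every worker with the node-wide union:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # set on both workers on the node
```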