[Core] Ray auto-detects NVIDIA GPUs with pynvml #41020

Merged 7 commits on Nov 13, 2023
Changes from 3 commits
32 changes: 32 additions & 0 deletions LICENSE
@@ -448,3 +448,35 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------

Code in python/ray/_private/thirdparty/pynvml is adapted from https://pypi.org/project/nvidia-ml-py

Copyright (c) 2011-2022, NVIDIA Corporation.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of staged-recipes nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130 changes: 77 additions & 53 deletions python/ray/_private/accelerators/nvidia_gpu.py
@@ -1,17 +1,12 @@
import re
import os
import sys
import logging
import subprocess
import importlib
from typing import Optional, List, Tuple

try:
    import GPUtil
except ImportError:
pass
import ray._private.thirdparty.pynvml as pynvml
from packaging.version import Version

from ray._private.accelerators.accelerator import AcceleratorManager
import ray._private.ray_constants as ray_constants

logger = logging.getLogger(__name__)

@@ -22,6 +17,9 @@
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")

# Driver version starting from which MIG devices report their own UUID.
MIG_UUID_DRIVER_VERSION = "470.42.01"


class NvidiaGPUAcceleratorManager(AcceleratorManager):
"""Nvidia GPU accelerators."""
@@ -39,7 +37,6 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        cuda_visible_devices = os.environ.get(
            NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if cuda_visible_devices is None:
            return None

@@ -53,55 +50,82 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        num_gpus = 0
        if importlib.util.find_spec("GPUtil"):
            gpu_list = GPUtil.getGPUs()
            num_gpus = len(gpu_list)
        elif sys.platform.startswith("linux"):
            proc_gpus_path = "/proc/driver/nvidia/gpus"
            if os.path.isdir(proc_gpus_path):
                num_gpus = len(os.listdir(proc_gpus_path))
        elif sys.platform == "win32":
            props = "AdapterCompatibility"
            cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
            lines = subprocess.check_output(cmdargs).splitlines()[1:]
            num_gpus = len([x.rstrip() for x in lines if x.startswith(b"NVIDIA")])
        return num_gpus
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            return 0  # pynvml init failed
        driver_version = pynvml.nvmlSystemGetDriverVersion()
        device_count = pynvml.nvmlDeviceGetCount()
        cuda_devices = []
        for index in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            mig_enabled = os.environ.get(
                ray_constants.RAY_ENABLE_MIG_DETECTION_ENV_VAR, False
            )
            if mig_enabled:
                try:
                    max_mig_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
                except pynvml.NVMLError_NotSupported:
                    cuda_devices.append(str(index))
                    continue
                for mig_index in range(max_mig_count):
                    try:
                        mig_handle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                            handle, mig_index
                        )
                        mig_uuid = ""
                        if Version(driver_version) >= Version(MIG_UUID_DRIVER_VERSION):
                            mig_uuid = pynvml.nvmlDeviceGetUUID(mig_handle)
                        else:
                            mig_uuid = (
                                f"MIG-{pynvml.nvmlDeviceGetUUID(handle)}"
                                f"/{pynvml.nvmlDeviceGetComputeInstanceId(mig_handle)}"
                                f"/{pynvml.nvmlDeviceGetGpuInstanceId(mig_handle)}"
                            )
                        cuda_devices.append(mig_uuid)
                    except pynvml.NVMLError:
                        break
            else:
                cuda_devices.append(str(index))
        os.environ[
            NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(cuda_devices)
        pynvml.nvmlShutdown()
        return len(cuda_devices)

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        try:
            if importlib.util.find_spec("GPUtil"):
                gpu_list = GPUtil.getGPUs()
                if len(gpu_list) > 0:
                    gpu_list_names = [gpu.name for gpu in gpu_list]
                    return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
                        gpu_list_names.pop()
                    )
            elif sys.platform.startswith("linux"):
                proc_gpus_path = "/proc/driver/nvidia/gpus"
                if not os.path.isdir(proc_gpus_path):
                    return None
                gpu_dirs = os.listdir(proc_gpus_path)
                if len(gpu_dirs) == 0:
                    return None
                gpu_info_path = f"{proc_gpus_path}/{gpu_dirs[0]}/information"
                info_str = open(gpu_info_path).read()
                if not info_str:
                    return None
                lines = info_str.split("\n")
                full_model_name = None
                for line in lines:
                    split = line.split(":")
                    if len(split) != 2:
                        continue
                    k, v = split
                    if k.strip() == "Model":
                        full_model_name = v.strip()
                        break
                return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
                    full_model_name
                )
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()
            cuda_devices_names = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                mig_enabled = os.environ.get(
                    ray_constants.RAY_ENABLE_MIG_DETECTION_ENV_VAR, False
                )
                if mig_enabled:
                    try:
                        max_mig_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
                    except pynvml.NVMLError_NotSupported:
                        cuda_devices_names.append(pynvml.nvmlDeviceGetName(handle))
                        continue
                    for mig_index in range(max_mig_count):
                        try:
                            mig_handle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                                handle, mig_index
                            )
                            cuda_devices_names.append(
                                pynvml.nvmlDeviceGetName(mig_handle)
                            )
                        except pynvml.NVMLError:
                            break
                else:
                    cuda_devices_names.append(pynvml.nvmlDeviceGetName(handle))
            pynvml.nvmlShutdown()
            return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
                cuda_devices_names.pop()
            )
        except Exception:
            logger.exception("Could not parse gpu information.")
        return None
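
For context when reviewing the new detection path, here is a minimal sketch (not part of this diff) of how the manager above is expected to be exercised. The module path, the class name, and the RAY_ENABLE_MIG_DETECTION environment variable are taken from the changes in this PR; everything else is illustrative.

import os

# Opt in to MIG device enumeration before the node is inspected
# (the env var name maps to RAY_ENABLE_MIG_DETECTION_ENV_VAR added in ray_constants.py below).
os.environ["RAY_ENABLE_MIG_DETECTION"] = "1"

from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager

# Counts physical GPUs, or MIG devices when MIG detection is enabled;
# returns 0 if NVML cannot be initialized (e.g. no NVIDIA driver present).
num_gpus = NvidiaGPUAcceleratorManager.get_current_node_num_accelerators()

# Maps the detected device name (e.g. "Tesla V100-SXM2-16GB") to an
# accelerator type string such as "V100"; None if nothing is detected.
accelerator_type = NvidiaGPUAcceleratorManager.get_current_node_accelerator_type()

print(num_gpus, accelerator_type)
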
2 changes: 2 additions & 0 deletions python/ray/_private/ray_constants.py
@@ -480,3 +480,5 @@ def gcs_actor_scheduling_enabled():


RAY_NODE_IP_FILENAME = "node_ip_address.json"

RAY_ENABLE_MIG_DETECTION_ENV_VAR = "RAY_ENABLE_MIG_DETECTION"
3 changes: 3 additions & 0 deletions python/ray/_private/resource_spec.py
@@ -199,6 +199,9 @@ def resolve(self, is_head: bool, node_ip_address: Optional[str] = None):
                num_accelerators = (
                    accelerator_manager.get_current_node_num_accelerators()
                )
                visible_accelerator_ids = (
                    accelerator_manager.get_current_process_visible_accelerator_ids()
                )
                # Don't use more accelerators than allowed by visible accelerator ids.
                if visible_accelerator_ids is not None:
                    num_accelerators = min(
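
The new visible_accelerator_ids lookup feeds the clamp shown just below it in the hunk. A small illustration of the intended effect (not from the diff; the values are made up):

# Suppose pynvml detects 8 GPUs on the machine, but CUDA_VISIBLE_DEVICES
# only exposes two of them to this Ray process.
num_accelerators = 8
visible_accelerator_ids = ["0", "1"]  # parsed from CUDA_VISIBLE_DEVICES

# Don't advertise more accelerators than the visible accelerator ids allow.
if visible_accelerator_ids is not None:
    num_accelerators = min(num_accelerators, len(visible_accelerator_ids))

assert num_accelerators == 2
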
1 change: 1 addition & 0 deletions python/ray/_private/thirdparty/pynvml/__init__.py
@@ -0,0 +1 @@
from ray._private.thirdparty.pynvml.pynvml import *
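
A short sketch of how the vendored module is meant to be consumed (illustrative only; it uses the same NVML calls that appear in nvidia_gpu.py above):

# The vendored copy removes the need for the external GPUtil / nvidia-ml-py
# dependency; it exposes the NVML bindings under ray._private.thirdparty.
import ray._private.thirdparty.pynvml as pynvml

try:
    pynvml.nvmlInit()
except pynvml.NVMLError:
    print("NVML unavailable (no NVIDIA driver on this node)")
else:
    print(pynvml.nvmlSystemGetDriverVersion())
    print(pynvml.nvmlDeviceGetCount())
    pynvml.nvmlShutdown()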