Skip to content

Commit

Permalink
[Core] Ray auto-detects NVIDIA GPUs with pynvml (ray-project#41020)
Browse files Browse the repository at this point in the history
Signed-off-by: Jonathan Nitisastro <jonathancn@anyscale.com>
  • Loading branch information
jonathan-anyscale authored and ujjawal-khare committed Nov 29, 2023
1 parent 100314a commit 3759c27
Show file tree
Hide file tree
Showing 6 changed files with 5,069 additions and 89 deletions.
32 changes: 32 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -448,3 +448,35 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------

Code in python/ray/_private/thirdparty/pynvml is adapted from https://pypi.org/project/nvidia-ml-py

Copyright (c) 2011-2022, NVIDIA Corporation.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of the NVIDIA Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
77 changes: 21 additions & 56 deletions python/ray/_private/accelerators/nvidia_gpu.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
import re
import os
import sys
import logging
import subprocess
import importlib
from typing import Optional, List, Tuple

try:
import GPUtil
except ImportError:
pass
import ray._private.thirdparty.pynvml as pynvml

from ray._private.accelerators.accelerator import AcceleratorManager

Expand Down Expand Up @@ -39,7 +32,6 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
cuda_visible_devices = os.environ.get(
NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
)

if cuda_visible_devices is None:
return None

Expand All @@ -53,58 +45,31 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:

@staticmethod
def get_current_node_num_accelerators() -> int:
num_gpus = 0
if importlib.util.find_spec("GPUtil"):
gpu_list = GPUtil.getGPUs()
num_gpus = len(gpu_list)
elif sys.platform.startswith("linux"):
proc_gpus_path = "/proc/driver/nvidia/gpus"
if os.path.isdir(proc_gpus_path):
num_gpus = len(os.listdir(proc_gpus_path))
elif sys.platform == "win32":
props = "AdapterCompatibility"
cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
lines = subprocess.check_output(cmdargs).splitlines()[1:]
num_gpus = len([x.rstrip() for x in lines if x.startswith(b"NVIDIA")])
return num_gpus
try:
pynvml.nvmlInit()
except pynvml.NVMLError:
return 0 # pynvml init failed
device_count = pynvml.nvmlDeviceGetCount()
pynvml.nvmlShutdown()
return device_count

@staticmethod
def get_current_node_accelerator_type() -> Optional[str]:
try:
if importlib.util.find_spec("GPUtil"):
gpu_list = GPUtil.getGPUs()
if len(gpu_list) > 0:
gpu_list_names = [gpu.name for gpu in gpu_list]
return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
gpu_list_names.pop()
)
elif sys.platform.startswith("linux"):
proc_gpus_path = "/proc/driver/nvidia/gpus"
if not os.path.isdir(proc_gpus_path):
return None
gpu_dirs = os.listdir(proc_gpus_path)
if len(gpu_dirs) == 0:
return None
gpu_info_path = f"{proc_gpus_path}/{gpu_dirs[0]}/information"
info_str = open(gpu_info_path).read()
if not info_str:
return None
lines = info_str.split("\n")
full_model_name = None
for line in lines:
split = line.split(":")
if len(split) != 2:
continue
k, v = split
if k.strip() == "Model":
full_model_name = v.strip()
break
return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
full_model_name
pynvml.nvmlInit()
except pynvml.NVMLError:
return None # pynvml init failed
device_count = pynvml.nvmlDeviceGetCount()
cuda_device_type = None
if device_count > 0:
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
cuda_device_type = (
NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
pynvml.nvmlDeviceGetName(handle)
)
except Exception:
logger.exception("Could not parse gpu information.")
return None
)
pynvml.nvmlShutdown()
return cuda_device_type

@staticmethod
def _gpu_name_to_accelerator_type(name):
Expand Down
3 changes: 3 additions & 0 deletions python/ray/_private/thirdparty/pynvml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ray._private.thirdparty.pynvml.pynvml import *
# current version
__version__ = "12.535.133"
Loading

0 comments on commit 3759c27

Please sign in to comment.