Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dashboard] Remove gpustats dependencies from Ray[default] #41044

Merged
merged 9 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ci/env/check_minimal_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"aiohttp_cors",
"colorful",
"py-spy",
"gpustat",
"opencensus",
"prometheus_client",
"smart_open",
Expand Down
9 changes: 5 additions & 4 deletions dashboard/client/src/pages/node/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,13 @@ const columns = [
<Typography>
Usage of each GPU device. If no GPU usage is detected, here are the
potential root causes: <br />
1. library gpustsat is not installed. Install gpustat and try again.
1. library pynvml is not installed. Install pynvml and try again.
jonathan-anyscale marked this conversation as resolved.
Show resolved Hide resolved
<br /> 2. non-GPU Ray image is used on this node. Switch to a GPU Ray
image and try again. <br />
3. AMD GPUs are being used. AMD GPUs are not currently supported by
gpustat module. <br />
4. gpustat module raises an exception.
3. Non Nvidia GPUs are being used. Non Nvidia GPUs' utilizations are not
currently supported.
<br />
4. pynvml module raises an exception.
</Typography>
),
},
Expand Down
15 changes: 4 additions & 11 deletions dashboard/client/src/type/node.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,28 +54,21 @@ export type NodeListRsp = {
msg: string;
};

export type GPUProcessStats = {
// Sub stat of GPU stats, this type represents the GPU
// utilization of a single process of a single GPU.
username: string;
command: string;
gpuMemoryUsage: number;
export type ProcessGPUUsage = {
// GPU usage stats of a single process.
pid: number;
gpuMemoryUsage: number;
};

export type GPUStats = {
// This represents stats fetched from a node about a single GPU
uuid: string;
index: number;
name: string;
temperatureGpu: number;
fanSpeed: number;
utilizationGpu?: number;
powerDraw: number;
enforcedPowerLimit: number;
memoryUsed: number;
memoryTotal: number;
jonathan-anyscale marked this conversation as resolved.
Show resolved Hide resolved
processes?: GPUProcessStats[];
processes?: ProcessGPUUsage[];
};

export type NodeDetailExtend = {
Expand Down
118 changes: 74 additions & 44 deletions dashboard/modules/reporter/reporter_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
import socket
import sys
import traceback
import warnings

import psutil

from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, TypedDict
from collections import defaultdict

import ray
Expand All @@ -28,8 +27,8 @@
from prometheus_client.core import REGISTRY
from ray._private.metrics_agent import Gauge, MetricsAgent, Record
from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS
import ray._private.thirdparty.pynvml as pynvml
from ray.core.generated import reporter_pb2, reporter_pb2_grpc
from ray.util.debug import log_once
from ray.dashboard import k8s_utils
from ray._raylet import WorkerID

Expand All @@ -48,25 +47,6 @@
# Using existence of /sys/fs/cgroup as the criterion is consistent with
# Ray's existing resource logic, see e.g. ray._private.utils.get_num_cpus().

try:
import gpustat.core as gpustat
except ModuleNotFoundError:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"`gpustat` package is not installed. GPU monitoring is "
"not available. To have full functionality of the "
"dashboard please install `pip install ray["
"default]`.)"
)
except ImportError as e:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"Importing gpustat failed, fix this to have full "
"functionality of the dashboard. The original error was:\n\n" + e.msg
)


def recursive_asdict(o):
if isinstance(o, tuple) and hasattr(o, "_asdict"):
Expand Down Expand Up @@ -268,6 +248,29 @@ def jsonify_asdict(o) -> str:
),
}

# Number of bytes in one mebibyte; used to convert NVML byte counts to MiB.
MiB = 1024 * 1024

# Type aliases that document the unit carried by a plain ``int`` field.
Percentage = int
Megabytes = int


class ProcessGPUInfo(TypedDict):
    """GPU usage of a single process on one NVIDIA GPU."""

    # OS process id of the process using the GPU.
    pid: int
    # GPU memory held by the process (computed as ``usedGpuMemory // MiB``).
    gpu_memory_usage: Megabytes


class GpuUtilizationInfo(TypedDict):
    """Utilization snapshot of one NVIDIA GPU."""

    # Device index used to obtain the NVML handle.
    index: int
    name: str
    uuid: str
    # ``None`` when the utilization query failed for this device.
    utilization_gpu: Optional[Percentage]
    memory_used: Megabytes
    memory_total: Megabytes
    # ``None`` when the running-process query failed for this device.
    processes_pids: Optional[List[ProcessGPUInfo]]


class ReporterAgent(
dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer
Expand Down Expand Up @@ -394,30 +397,64 @@ def _get_cpu_percent(in_k8s: bool):
@staticmethod
def _get_gpu_usage():
    """Collect utilization and memory stats for every NVIDIA GPU via pynvml.

    Returns:
        A list with one ``GpuUtilizationInfo`` per GPU reported by NVML, in
        device-index order. The list is empty when the GPU usage check is
        disabled or NVML cannot be initialized. If a query fails part-way
        through, the entries gathered so far are returned.
    """
    global enable_gpu_usage_check
    if pynvml is None or not enable_gpu_usage_check:
        return []

    def _as_str(value):
        # Some pynvml versions return bytes from the string getters,
        # others return str; normalize so the result is always text.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    gpu_utilizations = []
    try:
        pynvml.nvmlInit()
        try:
            num_gpus = pynvml.nvmlDeviceGetCount()
            for i in range(num_gpus):
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)

                # Utilization is best-effort: a failure leaves it as None
                # instead of dropping the whole GPU entry.
                utilization = None
                try:
                    utilization_info = pynvml.nvmlDeviceGetUtilizationRates(
                        gpu_handle
                    )
                    utilization = int(utilization_info.gpu)
                except pynvml.NVMLError as e:
                    logger.debug(f"pynvml failed to retrieve GPU utilization: {e}")

                # Per-process usage is best-effort as well.
                processes_pids = None
                try:
                    nv_comp_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
                        gpu_handle
                    )
                    nv_graphics_processes = (
                        pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
                    )
                    processes_pids = [
                        ProcessGPUInfo(
                            pid=nv_process.pid,
                            # usedGpuMemory may be unset; report 0 then.
                            gpu_memory_usage=nv_process.usedGpuMemory // MiB
                            if nv_process.usedGpuMemory
                            else 0,
                        )
                        for nv_process in (nv_comp_processes + nv_graphics_processes)
                    ]
                except pynvml.NVMLError as e:
                    logger.debug(f"pynvml failed to retrieve GPU processes: {e}")

                # Query memory once and convert NVML's byte counts to MiB so
                # the values match the ``Megabytes`` fields of
                # GpuUtilizationInfo.
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
                gpu_utilizations.append(
                    GpuUtilizationInfo(
                        index=i,
                        name=_as_str(pynvml.nvmlDeviceGetName(gpu_handle)),
                        uuid=_as_str(pynvml.nvmlDeviceGetUUID(gpu_handle)),
                        utilization_gpu=utilization,
                        memory_used=int(memory_info.used) // MiB,
                        memory_total=int(memory_info.total) // MiB,
                        processes_pids=processes_pids,
                    )
                )
        finally:
            # Always release NVML once it was initialized, even when a
            # device query above raised.
            pynvml.nvmlShutdown()
    except Exception as e:
        logger.debug(f"pynvml failed to retrieve GPU information: {e}")

        # On machines without GPUs, pynvml.nvmlInit() can run subprocesses that
        # spew to stderr. Then with log_to_driver=True, we get log spew from every
        # single raylet. To avoid this, disable the GPU usage check on
        # certain errors.
        # https://github.com/ray-project/ray/issues/14305
        # https://github.com/ray-project/ray/pull/21686
        if type(e).__name__ == "NVMLError_DriverNotLoaded":
            enable_gpu_usage_check = False

    return gpu_utilizations

@staticmethod
Expand Down Expand Up @@ -911,21 +948,14 @@ def _record_stats(self, stats, cluster_stats):
)
records_reported.append(node_mem_shared)

# The output example of gpustats.
# The output example of GpuUtilizationInfo.
"""
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
'memory_total': 22731}
"""
# -- GPU per node --
gpus = stats["gpus"]
Expand Down
36 changes: 0 additions & 36 deletions dashboard/modules/reporter/tests/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,13 +359,7 @@ def test_report_stats_gpu():
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
Expand All @@ -376,13 +370,7 @@ def test_report_stats_gpu():
"index": 0,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 0,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 0,
"memory_total": GPU_MEMORY,
"processes": [],
Expand All @@ -391,13 +379,7 @@ def test_report_stats_gpu():
"index": 1,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 1,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 1,
"memory_total": GPU_MEMORY,
"processes": [],
Expand All @@ -406,13 +388,7 @@ def test_report_stats_gpu():
"index": 2,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 2,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 2,
"memory_total": GPU_MEMORY,
"processes": [],
Expand All @@ -421,13 +397,7 @@ def test_report_stats_gpu():
{
"index": 3,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": GPU_MEMORY,
"processes": [],
Expand All @@ -436,13 +406,7 @@ def test_report_stats_gpu():
{
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": 22731,
"processes": [],
Expand Down
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_gpu_auto_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def host_to_device_auto_pipeline(device):

def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
"""Tests that auto_transfer uses the right device for the cuda stream."""
import pynvml
import ray._private.thirdparty.pynvml as pynvml

pynvml.nvmlInit()

Expand Down
1 change: 0 additions & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ scipy
colorful
pyyaml
rich
gpustat>=1.0.0
opentelemetry-sdk
fastapi
gymnasium==0.28.1
Expand Down
1 change: 0 additions & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,6 @@ def get_packages(self):
"colorful",
"py-spy >= 0.2.0",
"requests",
"gpustat >= 1.0.0", # for windows
"grpcio >= 1.32.0; python_version < '3.10'", # noqa:E501
"grpcio >= 1.42.0; python_version >= '3.10'", # noqa:E501
"opencensus",
Expand Down
Loading