[Dashboard] Remove gpustats dependencies from Ray[default] #41044

Merged · 9 commits · Nov 22, 2023

Changes from 4 commits
1 change: 0 additions & 1 deletion ci/env/check_minimal_install.py
@@ -18,7 +18,6 @@
"aiohttp_cors",
"colorful",
"py-spy",
"gpustat",
"opencensus",
"prometheus_client",
"smart_open",
12 changes: 6 additions & 6 deletions dashboard/client/src/pages/node/index.tsx
@@ -85,13 +85,13 @@ const columns = [
helpInfo: (
<Typography>
Usage of each GPU device. If no GPU usage is detected, here are the
potential root causes: <br />
1. library gpustsat is not installed. Install gpustat and try again.
<br /> 2. non-GPU Ray image is used on this node. Switch to a GPU Ray
potential root causes:<br />
1. non-GPU Ray image is used on this node. Switch to a GPU Ray
image and try again. <br />
3. AMD GPUs are being used. AMD GPUs are not currently supported by
gpustat module. <br />
4. gpustat module raises an exception.
2. Non Nvidia GPUs are being used. Non Nvidia GPUs' utilizations are not
currently supported.
<br />
3. pynvml module raises an exception.
</Typography>
),
},
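The updated help text points users at pynvml rather than gpustat. As a quick check for root cause 3 ("pynvml module raises an exception"), a node can be probed directly with the vendored module — a minimal sketch, assuming the `ray._private.thirdparty.pynvml` import path used elsewhere in this PR:

```python
# Minimal probe of the vendored pynvml used by the dashboard agent (sketch).
# If nvmlInit() raises, the GPU panel for this node will show no usage.
import ray._private.thirdparty.pynvml as pynvml

try:
    pynvml.nvmlInit()
    num_gpus = pynvml.nvmlDeviceGetCount()
    print(f"pynvml OK, {num_gpus} GPU(s) visible")
    for i in range(num_gpus):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print(f"  GPU {i}: {pynvml.nvmlDeviceGetName(handle)}")
    pynvml.nvmlShutdown()
except pynvml.NVMLError as e:
    # Expected on nodes without an NVIDIA driver, e.g. NVMLError_DriverNotLoaded.
    print(f"pynvml failed: {e}")
```

If `nvmlInit()` raises (for example `NVMLError_DriverNotLoaded` on a CPU-only node), the GPU columns stay empty, matching the behavior in `reporter_agent.py` below.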
15 changes: 4 additions & 11 deletions dashboard/client/src/type/node.d.ts
@@ -54,28 +54,21 @@ export type NodeListRsp = {
msg: string;
};

export type GPUProcessStats = {
// Sub stat of GPU stats, this type represents the GPU
// utilization of a single process of a single GPU.
username: string;
command: string;
gpuMemoryUsage: number;
export type ProcessGPUUsage = {
// This gpu usage stats from a process
pid: number;
gpuMemoryUsage: number;
};

export type GPUStats = {
// This represents stats fetched from a node about a single GPU
uuid: string;
index: number;
name: string;
temperatureGpu: number;
fanSpeed: number;
utilizationGpu?: number;
powerDraw: number;
enforcedPowerLimit: number;
memoryUsed: number;
memoryTotal: number;
processes?: GPUProcessStats[];
processes?: ProcessGPUUsage[];
};

export type NodeDetailExtend = {
120 changes: 76 additions & 44 deletions dashboard/modules/reporter/reporter_agent.py
@@ -6,11 +6,10 @@
import socket
import sys
import traceback
import warnings

import psutil

from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, TypedDict
from collections import defaultdict

import ray
@@ -28,8 +27,8 @@
from prometheus_client.core import REGISTRY
from ray._private.metrics_agent import Gauge, MetricsAgent, Record
from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS
import ray._private.thirdparty.pynvml as pynvml
from ray.core.generated import reporter_pb2, reporter_pb2_grpc
from ray.util.debug import log_once
from ray.dashboard import k8s_utils
from ray._raylet import WorkerID

@@ -48,25 +47,6 @@
# Using existence of /sys/fs/cgroup as the criterion is consistent with
# Ray's existing resource logic, see e.g. ray._private.utils.get_num_cpus().

try:
import gpustat.core as gpustat
except ModuleNotFoundError:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"`gpustat` package is not installed. GPU monitoring is "
"not available. To have full functionality of the "
"dashboard please install `pip install ray["
"default]`.)"
)
except ImportError as e:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"Importing gpustat failed, fix this to have full "
"functionality of the dashboard. The original error was:\n\n" + e.msg
)


def recursive_asdict(o):
if isinstance(o, tuple) and hasattr(o, "_asdict"):
@@ -268,6 +248,29 @@ def jsonify_asdict(o) -> str:
),
}

MB = 1024 * 1024

# Types
Percentage = int
Megabytes = int


# gpu utilization for nvidia gpu from a single process
class ProcessGPUInfo(TypedDict):
pid: int
gpu_memory_usage: int


# gpu utilization for nvidia gpu
class GpuUtilizationInfo(TypedDict):
index: int
name: str
uuid: str
utilization_gpu: Optional[Percentage]
memory_used: Megabytes
memory_total: Megabytes
processes_pids: Optional[List[ProcessGPUInfo]]


class ReporterAgent(
dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer
@@ -394,30 +397,66 @@ def _get_cpu_percent(in_k8s: bool):
@staticmethod
def _get_gpu_usage():
global enable_gpu_usage_check
if gpustat is None or not enable_gpu_usage_check:
if not enable_gpu_usage_check:
return []
gpu_utilizations = []
gpus = []
try:
gpus = gpustat.new_query().gpus
pynvml.nvmlInit()
num_gpus = pynvml.nvmlDeviceGetCount()
for i in range(num_gpus):
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
utilization = None
try:
utilization_info = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
utilization = int(utilization_info.gpu)
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU utilization: {e}")

# processes pids
processes_pids = None
try:
nv_comp_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
gpu_handle
)
nv_graphics_processes = (
pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
)
processes_pids = [
ProcessGPUInfo(
pid=nv_process.pid,
gpu_memory_usage=nv_process.usedGpuMemory // MB
if nv_process.usedGpuMemory
else 0,
)
for nv_process in (nv_comp_processes + nv_graphics_processes)
]
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU processes: {e}")

info = GpuUtilizationInfo(
index=i,
name=pynvml.nvmlDeviceGetName(gpu_handle),
uuid=pynvml.nvmlDeviceGetUUID(gpu_handle),
utilization_gpu=utilization,
memory_used=int(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used)
// MB,
memory_total=int(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).total)
// MB,
Review comment (Contributor):
We could merge these two nvmlDeviceGetMemoryInfo calls into one.
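For illustration, a minimal sketch of that suggestion — the two `nvmlDeviceGetMemoryInfo` calls collapsed into a single query whose `used` and `total` fields are reused (assumes the same `gpu_handle`, `MB`, and surrounding variables as in this diff):

```python
# Sketch (inside the per-GPU loop of ReporterAgent._get_gpu_usage):
# query NVML memory info once per device and reuse the struct's
# `used` and `total` fields instead of calling nvmlDeviceGetMemoryInfo twice.
memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)

info = GpuUtilizationInfo(
    index=i,
    name=pynvml.nvmlDeviceGetName(gpu_handle),
    uuid=pynvml.nvmlDeviceGetUUID(gpu_handle),
    utilization_gpu=utilization,
    memory_used=int(memory_info.used) // MB,
    memory_total=int(memory_info.total) // MB,
    processes_pids=processes_pids,
)
```

This also saves one NVML round trip per device.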

Review comment (Contributor):
FYI, in NVIDIA driver 510.39.01, a v2 memory info API was added:

https://github.com/NVIDIA/nvidia-settings/blob/510.39.01/src/nvml.h#L218-L241

The unversioned API (v1) and v2 API will return different results on R510+ drivers.

processes_pids=processes_pids,
)
gpu_utilizations.append(info)
pynvml.nvmlShutdown()
except Exception as e:
logger.debug(f"gpustat failed to retrieve GPU information: {e}")
logger.debug(f"pynvml failed to retrieve GPU information: {e}")

# gpustat calls pynvml.nvmlInit()
# On machines without GPUs, this can run subprocesses that spew to
# stderr. Then with log_to_driver=True, we get log spew from every
# On machines without GPUs, pynvml.nvmlInit() can run subprocesses that
# spew to stderr. Then with log_to_driver=True, we get log spew from every
# single raylet. To avoid this, disable the GPU usage check on
# certain errors.
# https://github.com/ray-project/ray/issues/14305
# https://github.com/ray-project/ray/pull/21686
if type(e).__name__ == "NVMLError_DriverNotLoaded":
enable_gpu_usage_check = False

for gpu in gpus:
# Note the keys in this dict have periods which throws
# off javascript so we change .s to _s
gpu_data = {"_".join(key.split(".")): val for key, val in gpu.entry.items()}
gpu_utilizations.append(gpu_data)
return gpu_utilizations

@staticmethod
@@ -911,21 +950,14 @@ def _record_stats(self, stats, cluster_stats):
)
records_reported.append(node_mem_shared)

# The output example of gpustats.
# The output example of GpuUtilizationInfo.
"""
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
'memory_total': 22731}
"""
# -- GPU per node --
gpus = stats["gpus"]
36 changes: 0 additions & 36 deletions dashboard/modules/reporter/tests/test_reporter.py
@@ -359,13 +359,7 @@ def test_report_stats_gpu():
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
@@ -376,13 +370,7 @@
"index": 0,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 0,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 0,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -391,13 +379,7 @@
"index": 1,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 1,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 1,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -406,13 +388,7 @@
"index": 2,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 2,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 2,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -421,13 +397,7 @@
{
"index": 3,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -436,13 +406,7 @@
{
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": 22731,
"processes": [],
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_gpu_auto_transfer.py
@@ -66,7 +66,7 @@ def host_to_device_auto_pipeline(device):

def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
"""Tests that auto_transfer uses the right device for the cuda stream."""
import pynvml
import ray._private.thirdparty.pynvml as pynvml

pynvml.nvmlInit()

1 change: 0 additions & 1 deletion python/requirements.txt
@@ -38,7 +38,6 @@ scipy
colorful
pyyaml
rich
gpustat>=1.0.0
opentelemetry-sdk
fastapi
gymnasium==0.28.1
1 change: 0 additions & 1 deletion python/setup.py
@@ -247,7 +247,6 @@ def get_packages(self):
"colorful",
"py-spy >= 0.2.0",
"requests",
"gpustat >= 1.0.0", # for windows
"grpcio >= 1.32.0; python_version < '3.10'", # noqa:E501
"grpcio >= 1.42.0; python_version >= '3.10'", # noqa:E501
"opencensus",
3 changes: 2 additions & 1 deletion src/ray/raylet/scheduling/cluster_resource_scheduler.cc
@@ -45,7 +45,8 @@ ClusterResourceScheduler::ClusterResourceScheduler(
std::function<bool(scheduling::NodeID)> is_node_available_fn,
std::function<int64_t(void)> get_used_object_store_memory,
std::function<bool(void)> get_pull_manager_at_capacity,
const absl::flat_hash_map<std::string, std::string> &local_node_labels)
const absl::flat_hash_map<std::string, std::string>
&local_node_labels) // pass this to local resource manager
: local_node_id_(local_node_id), is_node_available_fn_(is_node_available_fn) {
NodeResources node_resources = ResourceMapToNodeResources(
local_node_resources, local_node_resources, local_node_labels);