Skip to content

Commit

Permalink
Add cuDF spilling statistics to RMM/GPU memory plot (#8148)
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesbluca authored Dec 18, 2023
1 parent 8c3eb6f commit b44e661
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 135 deletions.
4 changes: 4 additions & 0 deletions continuous_integration/gpuci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,7 @@ conda list --show-channel-urls

rapids-logger "Python py.test for distributed"
py.test distributed -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml"

# cuDF spill stats monitoring must be enabled for this test
CUDF_SPILL=on CUDF_SPILL_STATS=1 DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 \
py.test distributed/diagnostics/tests/test_cudf_diagnostics.py -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml"
262 changes: 134 additions & 128 deletions distributed/dashboard/components/rmm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import math
from textwrap import dedent
from collections.abc import Iterable
from typing import TypeVar

from bokeh.core.properties import without_property_validation
from bokeh.models import (
Expand All @@ -10,6 +10,7 @@
HoverTool,
NumeralTickFormatter,
OpenURL,
Range1d,
TapTool,
)
from bokeh.plotting import figure
Expand All @@ -18,12 +19,19 @@
from dask.utils import format_bytes

from distributed.dashboard.components import DashboardComponent, add_periodic_callback
from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
from distributed.dashboard.components.scheduler import (
BOKEH_THEME,
TICKS_1024,
XLABEL_ORIENTATION,
MemoryColor,
)
from distributed.dashboard.utils import update
from distributed.utils import log_errors

T = TypeVar("T")


class RMMMemoryUsage(DashboardComponent):
class RMMMemoryUsage(DashboardComponent, MemoryColor):
"""
GPU memory usage plot that includes information about memory
managed by RMM. If an RMM pool is being used, shows the amount of
Expand All @@ -32,168 +40,166 @@ class RMMMemoryUsage(DashboardComponent):

@log_errors
def __init__(self, scheduler, width=600, **kwargs):
DashboardComponent.__init__(self)
MemoryColor.__init__(self, neutral_color="#76B900")

self.last = 0
self.scheduler = scheduler
self.source = ColumnDataSource(
{
"rmm-used": [1, 2],
"rmm-used-half": [0.5, 1],
"rmm-total": [2, 4],
"rmm-total-half": [1, 2],
"external-used": [2, 1],
"external-used-x": [3, 4.5],
"worker": ["a", "b"],
"gpu-index": [0, 0],
"y": [1, 2],
"escaped_worker": ["a", "b"],
"rmm_memory_text": [
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
],
"width": [],
"x": [],
"y": [],
"color": [],
"alpha": [],
"worker": [],
"escaped_worker": [],
"rmm_used": [],
"rmm_total": [],
"gpu_used": [],
"gpu_total": [],
"spilled": [],
}
)

memory = figure(
title="RMM Memory",
self.root = figure(
title="RMM memory used",
tools="",
width=int(width / 2),
name="rmm_memory_histogram",
name="rmm_memory",
**kwargs,
)

rect = memory.rect(
source=self.source,
x="rmm-used-half",
y="y",
width="rmm-used",
height=1,
color="#76B900",
alpha=1.0,
)
rect.nonselection_glyph = None

rect = memory.rect(
rect = self.root.rect(
source=self.source,
x="rmm-total-half",
x="x",
y="y",
width="rmm-total",
height=1,
color="#76B900",
alpha=0.75,
width="width",
height=0.9,
color="color",
fill_alpha="alpha",
line_width=0,
)
rect.nonselection_glyph = None

rect = memory.rect(
source=self.source,
x="external-used-x",
y="y",
width="external-used",
height=1,
color="#76B900",
alpha=0.5,
self.root.axis[0].ticker = BasicTicker(**TICKS_1024)
self.root.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
self.root.xaxis.major_label_orientation = XLABEL_ORIENTATION
self.root.xaxis.minor_tick_line_alpha = 0
self.root.x_range = Range1d(start=0)
self.root.yaxis.visible = False
self.root.ygrid.visible = False
self.root.toolbar_location = None

tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
self.root.add_tools(tap)

hover = HoverTool(
point_policy="follow_mouse",
tooltips="""
<div>
<span style="font-size: 12px; font-weight: bold;">Worker:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@worker</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">RMM memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@rmm_used{0.00 b} / @rmm_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">GPU memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">Spilled to CPU:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@spilled{0.00 b}</span>
</div>
""",
)
rect.nonselection_glyph = None

memory.axis[0].ticker = BasicTicker(**TICKS_1024)
memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
memory.xaxis.major_label_orientation = -math.pi / 12
memory.x_range.start = 0

for fig in [memory]:
fig.xaxis.minor_tick_line_alpha = 0
fig.yaxis.visible = False
fig.ygrid.visible = False

tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
fig.add_tools(tap)

fig.toolbar_location = None
fig.yaxis.visible = False

hover = HoverTool()
hover.tooltips = "@worker : @rmm_memory_text"
hover.point_policy = "follow_mouse"
memory.add_tools(hover)

self.memory_figure = memory
self.root.add_tools(hover)

@without_property_validation
@log_errors
def update(self):
def quadlist(i: Iterable[T]) -> list[T]:
out = []
for ii in i:
out += [ii, ii, ii, ii]
return out

workers = list(self.scheduler.workers.values())
rmm_total = []

width = []
x = []
color = []
max_limit = 0
rmm_used = []
external_used = []
gpu_index = []
y = []
worker = []
external_used_x = []
memory_max = 0
rmm_total = []
gpu_used = []
gpu_total = []
rmm_memory_text = []
spilled = []

for idx, ws in enumerate(workers):
for ws in workers:
try:
rmm_metrics = ws.metrics["rmm"]
gpu_metrics = ws.metrics["gpu"]
gpu_info = ws.extra["gpu"]
except KeyError:
continue
rmm_total_worker = rmm_metrics["rmm-total"] # RMM memory only
rmm_used_worker = rmm_metrics["rmm-used"]
gpu_total_worker = gpu_info["memory-total"] # All GPU memory
gpu_used_worker = gpu_metrics["memory-used"]
rmm_metrics = {"rmm-used": 0, "rmm-total": 0}
gpu_metrics = {"memory-used": 0}
gpu_info = {"memory-total": 0}

try:
cudf_metrics = ws.metrics["cudf"]
except KeyError:
cudf_metrics = {"cudf-spilled": 0}

external_used_worker = gpu_used_worker - rmm_total_worker
rmm_used_worker = rmm_metrics["rmm-used"] # RMM memory only
rmm_total_worker = rmm_metrics["rmm-total"]
gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory
gpu_total_worker = gpu_info["memory-total"]
spilled_worker = cudf_metrics["cudf-spilled"] or 0 # memory spilled to host

rmm_total.append(rmm_total_worker)
max_limit = max(
max_limit, gpu_total_worker, gpu_used_worker + spilled_worker
)
color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status)

width += [
rmm_used_worker,
rmm_total_worker - rmm_used_worker,
gpu_used_worker - rmm_total_worker,
spilled_worker,
]
x += [sum(width[-4:i]) + width[i] / 2 for i in range(-4, 0)]
color += [color_i, color_i, color_i, "grey"]

# memory info
rmm_used.append(rmm_used_worker)
rmm_total.append(rmm_total_worker)
gpu_used.append(gpu_used_worker)
gpu_total.append(gpu_total_worker)
external_used.append(external_used_worker)
external_used_x.append(rmm_total_worker + external_used_worker / 2)
worker.append(ws.address)
gpu_index.append(idx)
y.append(idx)

memory_max = max(memory_max, gpu_total_worker)

rmm_memory_text.append(
"RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format(
format_bytes(rmm_used_worker),
format_bytes(rmm_total_worker),
format_bytes(gpu_used_worker),
format_bytes(gpu_total_worker),
)
)
spilled.append(spilled_worker)

self.memory_figure.title.text = dedent(
"""\
RMM Utilization: {} / {}
GPU Memory: {} / {}
""".format(
format_bytes(sum(rmm_used)),
format_bytes(sum(rmm_total)),
format_bytes(sum([*rmm_total, *external_used])),
format_bytes(sum(gpu_total)),
)
)
title = f"RMM memory used: {format_bytes(sum(rmm_used))} / {format_bytes(sum(rmm_total))}\nGPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}"
if sum(spilled):
title += f" + {format_bytes(sum(spilled))} spilled to CPU"
self.root.title.text = title

result = {
"rmm-total": rmm_total,
"rmm-used": rmm_used,
"external-used": external_used,
"rmm-total-half": [m // 2 for m in rmm_total],
"rmm-used-half": [m // 2 for m in rmm_used],
"external-used-x": external_used_x,
"worker": worker,
"gpu-index": gpu_index,
"y": y,
"escaped_worker": [escape.url_escape(w) for w in worker],
"rmm_memory_text": rmm_memory_text,
"width": width,
"x": x,
"y": quadlist(range(len(workers))),
"color": color,
"alpha": [1, 0.7, 0.4, 1] * len(workers),
"worker": quadlist(ws.address for ws in workers),
"escaped_worker": quadlist(escape.url_escape(ws.address) for ws in workers),
"rmm_used": quadlist(rmm_used),
"rmm_total": quadlist(rmm_total),
"gpu_used": quadlist(gpu_used),
"gpu_total": quadlist(gpu_total),
"spilled": quadlist(spilled),
}

self.memory_figure.x_range.end = memory_max

self.root.x_range.end = max_limit
update(self.source, result)


Expand All @@ -202,5 +208,5 @@ def rmm_memory_doc(scheduler, extra, doc):
rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
rmm_load.update()
add_periodic_callback(doc, rmm_load, 100)
doc.add_root(rmm_load.memory_figure)
doc.add_root(rmm_load.root)
doc.theme = BOKEH_THEME
19 changes: 13 additions & 6 deletions distributed/dashboard/components/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,25 +276,32 @@ class MemoryColor:
orange: float
red: float

def __init__(self):
def __init__(
self, neutral_color="blue", target_color="orange", terminated_color="red"
):
self.neutral_color = neutral_color
self.target_color = target_color
self.terminated_color = terminated_color

target = dask.config.get("distributed.worker.memory.target")
spill = dask.config.get("distributed.worker.memory.spill")
terminate = dask.config.get("distributed.worker.memory.terminate")

# These values can be False. It's also common to configure them to impossibly
# high values to achieve the same effect.
self.orange = min(target or math.inf, spill or math.inf)
self.red = min(terminate or math.inf, 1.0)

def _memory_color(self, current: int, limit: int, status: Status) -> str:
if status != Status.running:
return "red"
return self.terminated_color
if not limit:
return "blue"
return self.neutral_color
if current >= limit * self.red:
return "red"
return self.terminated_color
if current >= limit * self.orange:
return "orange"
return "blue"
return self.target_color
return self.neutral_color


class ClusterMemory(DashboardComponent, MemoryColor):
Expand Down
Loading

0 comments on commit b44e661

Please sign in to comment.