Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cuDF spilling statistics to RMM/GPU memory plot #8148

Merged
merged 26 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7fc10ce
Initial exposure of cuDF logging information
charlesbluca Aug 30, 2023
04137ef
Initial plot of GPU to CPU nbytes
charlesbluca Aug 31, 2023
d38de06
Refactor RMM plot to include spilled memory
charlesbluca Sep 1, 2023
eeddf1e
Fix memory limit on x axis
charlesbluca Sep 1, 2023
043835c
Remove unused dashboard plot
charlesbluca Sep 1, 2023
0ac3344
Allow MemoryColor colors to be overridden
charlesbluca Sep 6, 2023
698be13
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 6, 2023
bb49135
Linting
charlesbluca Sep 6, 2023
628fe39
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 7, 2023
50e626a
Add cudf diagnostics test
charlesbluca Sep 7, 2023
98e283e
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 11, 2023
008cca8
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 12, 2023
082ddef
Resolve bokeh test failures
charlesbluca Sep 12, 2023
87f8020
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 19, 2023
4453b8b
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 26, 2023
b60173d
Make cudf spilling monitoring optional and disabled by default
charlesbluca Sep 28, 2023
7426548
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 28, 2023
1890136
Modify cudf spilling test
charlesbluca Sep 28, 2023
5eafddc
Test cuDF spill tests in separate process
charlesbluca Sep 29, 2023
21e106b
Remove global cuDF spilling settings from build.sh
charlesbluca Oct 2, 2023
3cc4b94
cuDF metrics test is flaky
charlesbluca Oct 3, 2023
98dbfc7
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Oct 10, 2023
a5fce3c
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashb…
charlesbluca Oct 25, 2023
b2fdfc6
Shouldn't need dask-cuda worker for test
charlesbluca Oct 25, 2023
f96b8a4
Merge remote-tracking branch 'upstream/main' into pr/charlesbluca/8148
charlesbluca Dec 11, 2023
936f0f6
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashb…
charlesbluca Dec 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions continuous_integration/gpuci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ export CUDA_REL=${CUDA_VERSION%.*}
# FIXME - monitoring GIL contention causes UCX teardown issues
export DASK_DISTRIBUTED__ADMIN__SYSTEM_MONITOR__GIL__ENABLED=False

# enable monitoring of cuDF spilling
export CUDF_SPILL=on
export CUDF_SPILL_STATS=1
export DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially added this config variable to get test_no_unnecessary_imports_on_worker[scipy] and test_malloc_trim_threshold passing in GPU CI, but it seems like there isn't a trivial way to enable/disable cuDF spilling monitoring on a per-test basis.

Is there a way that we could somehow achieve this, or would it make sense to just not run these specific tests on GPU if we're expecting users to have cuDF installed on the workers?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would doing the same as this test work? If not, maybe our only option would be to launch the test in a separate process so that we can have full control of environment variables before import cudf.



################################################################################
# SETUP - Check environment
################################################################################
Expand Down
352 changes: 178 additions & 174 deletions distributed/dashboard/components/rmm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import math
from textwrap import dedent
from collections.abc import Iterable
from typing import TypeVar

from bokeh.core.properties import without_property_validation
from bokeh.models import (
Expand All @@ -10,6 +10,7 @@
HoverTool,
NumeralTickFormatter,
OpenURL,
Range1d,
TapTool,
)
from bokeh.plotting import figure
Expand All @@ -18,191 +19,194 @@
from dask.utils import format_bytes

from distributed.dashboard.components import DashboardComponent, add_periodic_callback
from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
from distributed.dashboard.components.scheduler import (
BOKEH_THEME,
TICKS_1024,
XLABEL_ORIENTATION,
MemoryColor,
)
from distributed.dashboard.utils import update
from distributed.utils import log_errors

T = TypeVar("T")


class RMMMemoryUsage(DashboardComponent):
class RMMMemoryUsage(DashboardComponent, MemoryColor):
"""
GPU memory usage plot that includes information about memory
managed by RMM. If an RMM pool is being used, shows the amount of
pool memory utilized.
"""

@log_errors
def __init__(self, scheduler, width=600, **kwargs):
with log_errors():
self.last = 0
self.scheduler = scheduler
self.source = ColumnDataSource(
{
"rmm-used": [1, 2],
"rmm-used-half": [0.5, 1],
"rmm-total": [2, 4],
"rmm-total-half": [1, 2],
"external-used": [2, 1],
"external-used-x": [3, 4.5],
"worker": ["a", "b"],
"gpu-index": [0, 0],
"y": [1, 2],
"escaped_worker": ["a", "b"],
"rmm_memory_text": [
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
],
}
)

memory = figure(
title="RMM Memory",
tools="",
width=int(width / 2),
name="rmm_memory_histogram",
**kwargs,
)

rect = memory.rect(
source=self.source,
x="rmm-used-half",
y="y",
width="rmm-used",
height=1,
color="#76B900",
alpha=1.0,
)
rect.nonselection_glyph = None

rect = memory.rect(
source=self.source,
x="rmm-total-half",
y="y",
width="rmm-total",
height=1,
color="#76B900",
alpha=0.75,
)
rect.nonselection_glyph = None

rect = memory.rect(
source=self.source,
x="external-used-x",
y="y",
width="external-used",
height=1,
color="#76B900",
alpha=0.5,
)
rect.nonselection_glyph = None

memory.axis[0].ticker = BasicTicker(**TICKS_1024)
memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
memory.xaxis.major_label_orientation = -math.pi / 12
memory.x_range.start = 0

for fig in [memory]:
fig.xaxis.minor_tick_line_alpha = 0
fig.yaxis.visible = False
fig.ygrid.visible = False

tap = TapTool(
callback=OpenURL(url="./info/worker/@escaped_worker.html")
)
fig.add_tools(tap)

fig.toolbar_location = None
fig.yaxis.visible = False

hover = HoverTool()
hover.tooltips = "@worker : @rmm_memory_text"
hover.point_policy = "follow_mouse"
memory.add_tools(hover)

self.memory_figure = memory
DashboardComponent.__init__(self)
MemoryColor.__init__(self, neutral_color="#76B900")

self.last = 0
self.scheduler = scheduler
self.source = ColumnDataSource(
{
"width": [],
"x": [],
"y": [],
"color": [],
"alpha": [],
"worker": [],
"escaped_worker": [],
"rmm_used": [],
"rmm_total": [],
"gpu_used": [],
"gpu_total": [],
"spilled": [],
}
)

self.root = figure(
title="RMM memory used",
tools="",
width=int(width / 2),
name="rmm_memory",
**kwargs,
)
rect = self.root.rect(
source=self.source,
x="x",
y="y",
width="width",
height=0.9,
color="color",
fill_alpha="alpha",
line_width=0,
)
rect.nonselection_glyph = None

self.root.axis[0].ticker = BasicTicker(**TICKS_1024)
self.root.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
self.root.xaxis.major_label_orientation = XLABEL_ORIENTATION
self.root.xaxis.minor_tick_line_alpha = 0
self.root.x_range = Range1d(start=0)
self.root.yaxis.visible = False
self.root.ygrid.visible = False
self.root.toolbar_location = None

tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
self.root.add_tools(tap)

hover = HoverTool(
point_policy="follow_mouse",
tooltips="""
<div>
<span style="font-size: 12px; font-weight: bold;">Worker:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@worker</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">RMM memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@rmm_used{0.00 b} / @rmm_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">GPU memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">Spilled to CPU:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@spilled{0.00 b}</span>
</div>
""",
)
self.root.add_tools(hover)

@without_property_validation
@log_errors
def update(self):
with log_errors():
workers = list(self.scheduler.workers.values())
rmm_total = []
rmm_used = []
external_used = []
gpu_index = []
y = []
worker = []
external_used_x = []
memory_max = 0
gpu_total = []
rmm_memory_text = []

for idx, ws in enumerate(workers):
try:
rmm_metrics = ws.metrics["rmm"]
gpu_metrics = ws.metrics["gpu"]
gpu_info = ws.extra["gpu"]
except KeyError:
continue
rmm_total_worker = rmm_metrics["rmm-total"] # RMM memory only
rmm_used_worker = rmm_metrics["rmm-used"]
gpu_total_worker = gpu_info["memory-total"] # All GPU memory
gpu_used_worker = gpu_metrics["memory-used"]

external_used_worker = gpu_used_worker - rmm_total_worker

rmm_total.append(rmm_total_worker)
rmm_used.append(rmm_used_worker)
gpu_total.append(gpu_total_worker)
external_used.append(external_used_worker)
external_used_x.append(rmm_total_worker + external_used_worker / 2)
worker.append(ws.address)
gpu_index.append(idx)
y.append(idx)

memory_max = max(memory_max, gpu_total_worker)

rmm_memory_text.append(
"RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format(
format_bytes(rmm_used_worker),
format_bytes(rmm_total_worker),
format_bytes(gpu_used_worker),
format_bytes(gpu_total_worker),
)
)

self.memory_figure.title.text = dedent(
"""\
RMM Utilization: {} / {}
GPU Memory: {} / {}
""".format(
format_bytes(sum(rmm_used)),
format_bytes(sum(rmm_total)),
format_bytes(sum([*rmm_total, *external_used])),
format_bytes(sum(gpu_total)),
)
def quadlist(i: Iterable[T]) -> list[T]:
    """Repeat each element of *i* four times consecutively.

    E.g. ``quadlist([a, b])`` -> ``[a, a, a, a, b, b, b, b]``; used to
    align one per-worker value with the four stacked rects per worker.
    """
    return [item for value in i for item in (value, value, value, value)]

workers = list(self.scheduler.workers.values())

width = []
x = []
color = []
max_limit = 0
rmm_used = []
rmm_total = []
gpu_used = []
gpu_total = []
spilled = []

for ws in workers:
try:
rmm_metrics = ws.metrics["rmm"]
gpu_metrics = ws.metrics["gpu"]
gpu_info = ws.extra["gpu"]

Check warning on line 144 in distributed/dashboard/components/rmm.py

View check run for this annotation

Codecov / codecov/patch

distributed/dashboard/components/rmm.py#L143-L144

Added lines #L143 - L144 were not covered by tests
except KeyError:
rmm_metrics = {"rmm-used": 0, "rmm-total": 0}
gpu_metrics = {"memory-used": 0}
gpu_info = {"memory-total": 0}

try:
cudf_metrics = ws.metrics["cudf"]
except KeyError:
cudf_metrics = {"cudf-spilled": 0}

rmm_used_worker = rmm_metrics["rmm-used"] # RMM memory only
rmm_total_worker = rmm_metrics["rmm-total"]
gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory
gpu_total_worker = gpu_info["memory-total"]
spilled_worker = cudf_metrics["cudf-spilled"] or 0 # memory spilled to host

max_limit = max(
max_limit, gpu_total_worker, gpu_used_worker + spilled_worker
)

result = {
"rmm-total": rmm_total,
"rmm-used": rmm_used,
"external-used": external_used,
"rmm-total-half": [m // 2 for m in rmm_total],
"rmm-used-half": [m // 2 for m in rmm_used],
"external-used-x": external_used_x,
"worker": worker,
"gpu-index": gpu_index,
"y": y,
"escaped_worker": [escape.url_escape(w) for w in worker],
"rmm_memory_text": rmm_memory_text,
}

self.memory_figure.x_range.end = memory_max

update(self.source, result)


color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status)

width += [
rmm_used_worker,
rmm_total_worker - rmm_used_worker,
gpu_used_worker - rmm_total_worker,
spilled_worker,
]
x += [sum(width[-4:i]) + width[i] / 2 for i in range(-4, 0)]
color += [color_i, color_i, color_i, "grey"]

# memory info
rmm_used.append(rmm_used_worker)
rmm_total.append(rmm_total_worker)
gpu_used.append(gpu_used_worker)
gpu_total.append(gpu_total_worker)
spilled.append(spilled_worker)

title = f"RMM memory used: {format_bytes(sum(rmm_used))} / {format_bytes(sum(rmm_total))}\nGPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}"
if sum(spilled):
title += f" + {format_bytes(sum(spilled))} spilled to CPU"

Check warning on line 184 in distributed/dashboard/components/rmm.py

View check run for this annotation

Codecov / codecov/patch

distributed/dashboard/components/rmm.py#L184

Added line #L184 was not covered by tests
self.root.title.text = title

result = {
"width": width,
"x": x,
"y": quadlist(range(len(workers))),
"color": color,
"alpha": [1, 0.7, 0.4, 1] * len(workers),
"worker": quadlist(ws.address for ws in workers),
"escaped_worker": quadlist(escape.url_escape(ws.address) for ws in workers),
"rmm_used": quadlist(rmm_used),
"rmm_total": quadlist(rmm_total),
"gpu_used": quadlist(gpu_used),
"gpu_total": quadlist(gpu_total),
"spilled": quadlist(spilled),
}

self.root.x_range.end = max_limit
update(self.source, result)


@log_errors
def rmm_memory_doc(scheduler, extra, doc):
with log_errors():
rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
rmm_load.update()
add_periodic_callback(doc, rmm_load, 100)
doc.add_root(rmm_load.memory_figure)
doc.theme = BOKEH_THEME
rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
rmm_load.update()
add_periodic_callback(doc, rmm_load, 100)
doc.add_root(rmm_load.root)
doc.theme = BOKEH_THEME
Loading
Loading