From 7fc10ce037a459a32dea51298a153d2ef99cd0df Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 30 Aug 2023 07:40:32 -0700 Subject: [PATCH 01/15] Initial exposure of cuDF logging information --- distributed/diagnostics/cudf.py | 26 ++++++++++++++++++++++++++ distributed/worker.py | 16 +++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 distributed/diagnostics/cudf.py diff --git a/distributed/diagnostics/cudf.py b/distributed/diagnostics/cudf.py new file mode 100644 index 0000000000..01118fb0c8 --- /dev/null +++ b/distributed/diagnostics/cudf.py @@ -0,0 +1,26 @@ +""" +Diagnostics for memory spilling managed by cuDF. +""" + +from __future__ import annotations + +try: + from cudf.core.buffer.spill_manager import get_global_manager +except ImportError: + get_global_manager = None + + +def real_time(): + if get_global_manager is None: + return {} + mgr = get_global_manager() + if mgr is None: + return {} + keys = { + "gpu-to-cpu": {"nbytes": 0, "time": 0}, + "cpu-to-gpu": {"nbytes": 0, "time": 0}, + } + for (src, dst), (nbytes, time) in mgr.statistics.spill_totals.items(): + keys[f"{src}-to-{dst}"]["nbytes"] = nbytes + keys[f"{src}-to-{dst}"]["time"] = time + return keys diff --git a/distributed/worker.py b/distributed/worker.py index a6acace93f..17769f6fa0 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -76,7 +76,7 @@ ) from distributed.core import rpc as RPCType from distributed.core import send_recv -from distributed.diagnostics import nvml, rmm +from distributed.diagnostics import cudf, nvml, rmm from distributed.diagnostics.plugin import _get_plugin_name from distributed.diskutils import WorkSpace from distributed.exceptions import Reschedule @@ -3217,6 +3217,20 @@ async def rmm_metric(worker): del _rmm +try: + import cudf as _cudf +except Exception: + pass +else: + + async def cudf_metric(worker): + result = await offload(cudf.real_time) + return result + + DEFAULT_METRICS["cudf"] = cudf_metric + del _cudf + + def print( *args, sep: str | None = " ", From 04137ef9a34ef161756e85cbb4309a25dbb3158e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 31 Aug 2023 09:45:48 -0700 Subject: [PATCH 02/15] Initial plot of GPU to CPU nbytes --- distributed/dashboard/components/cudf.py | 112 +++++++++++++++++++++++ distributed/dashboard/scheduler.py | 2 + 2 files changed, 114 insertions(+) create mode 100644 distributed/dashboard/components/cudf.py diff --git a/distributed/dashboard/components/cudf.py b/distributed/dashboard/components/cudf.py new file mode 100644 index 0000000000..8f31f17895 --- /dev/null +++ b/distributed/dashboard/components/cudf.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import math + +from bokeh.core.properties import without_property_validation +from bokeh.models import BasicTicker, ColumnDataSource, NumeralTickFormatter +from bokeh.plotting import figure + +from distributed.dashboard.components import DashboardComponent, add_periodic_callback +from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024 +from distributed.dashboard.utils import update +from distributed.utils import log_errors + + +class CudfSpillingStatistics(DashboardComponent): + """ + Plot giving an overview of per-worker GPU spilling statistics, including the number + of bytes spilled to/from CPU and the time spent spilling. 
+ """ + + def __init__(self, scheduler, width=600, **kwargs): + with log_errors(): + self.last = 0 + self.scheduler = scheduler + self.source = ColumnDataSource( + { + "from-gpu": [1, 2], + "from-gpu-half": [0.5, 1], + "worker": ["a", "b"], + "gpu-index": [0, 0], + "y": [1, 2], + } + ) + + bytes_spilled = figure( + title="Bytes spilled from GPU", + tools="", + width=int(width / 2), + name="bytes_spilled_histogram", + **kwargs, + ) + + rect = bytes_spilled.rect( + source=self.source, + x="from-gpu-half", + y="y", + width="from-gpu", + height=1, + color="#76B900", + alpha=1.0, + ) + rect.nonselection_glyph = None + + bytes_spilled.axis[0].ticker = BasicTicker(**TICKS_1024) + bytes_spilled.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b") + bytes_spilled.xaxis.major_label_orientation = -math.pi / 12 + bytes_spilled.x_range.start = 0 + + for fig in [bytes_spilled]: + fig.xaxis.minor_tick_line_alpha = 0 + fig.yaxis.visible = False + fig.ygrid.visible = False + + fig.toolbar_location = None + fig.yaxis.visible = False + + self.bytes_spilled_figure = bytes_spilled + + @without_property_validation + def update(self): + with log_errors(): + workers = list(self.scheduler.workers.values()) + from_gpu = [] + gpu_index = [] + y = [] + worker = [] + memory_max = 0 + + for idx, ws in enumerate(workers): + try: + cudf_metrics = ws.metrics["cudf"] + gpu_info = ws.extra["gpu"] + except KeyError: + continue + + from_gpu.append(cudf_metrics["gpu-to-cpu"]["nbytes"]) + worker.append(ws.address) + gpu_index.append(idx) + y.append(idx) + + memory_max = max(memory_max, gpu_info["memory-total"]) + + result = { + "from-gpu": from_gpu, + "from-gpu-half": [m // 2 for m in from_gpu], + "worker": worker, + "gpu-index": gpu_index, + "y": y, + } + + self.bytes_spilled_figure.x_range.end = memory_max + + update(self.source, result) + + +def cudf_spilling_doc(scheduler, extra, doc): + with log_errors(): + cudf_spilling = CudfSpillingStatistics(scheduler, sizing_mode="stretch_both") + cudf_spilling.update() + add_periodic_callback(doc, cudf_spilling, 100) + doc.add_root(cudf_spilling.bytes_spilled_figure) + doc.theme = BOKEH_THEME diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py index 4fbe8b24b1..c9e8ed9982 100644 --- a/distributed/dashboard/scheduler.py +++ b/distributed/dashboard/scheduler.py @@ -6,6 +6,7 @@ from tornado import web from tornado.ioloop import IOLoop +from distributed.dashboard.components.cudf import cudf_spilling_doc from distributed.dashboard.components.nvml import ( gpu_doc, gpu_memory_doc, @@ -119,6 +120,7 @@ "/individual-gpu-memory": gpu_memory_doc, "/individual-gpu-utilization": gpu_utilization_doc, "/individual-rmm-memory": rmm_memory_doc, + "/individual-cudf-spilling": cudf_spilling_doc, } From d38de06c51d36220ec6b1e905d0974d701d90f21 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 1 Sep 2023 11:41:40 -0700 Subject: [PATCH 03/15] Refactor RMM plot to include spilled memory --- distributed/dashboard/components/rmm.py | 346 ++++++++++++------------ distributed/diagnostics/cudf.py | 17 +- 2 files changed, 179 insertions(+), 184 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index b0118a5582..b53c1ab0de 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -1,7 +1,7 @@ from __future__ import annotations -import math -from textwrap import dedent +from collections.abc import Iterable +from 
typing import TypeVar from bokeh.core.properties import without_property_validation from bokeh.models import ( @@ -10,6 +10,7 @@ HoverTool, NumeralTickFormatter, OpenURL, + Range1d, TapTool, ) from bokeh.plotting import figure @@ -18,191 +19,186 @@ from dask.utils import format_bytes from distributed.dashboard.components import DashboardComponent, add_periodic_callback -from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024 +from distributed.dashboard.components.scheduler import ( + BOKEH_THEME, + TICKS_1024, + XLABEL_ORIENTATION, + MemoryColor, +) from distributed.dashboard.utils import update from distributed.utils import log_errors +T = TypeVar("T") + -class RMMMemoryUsage(DashboardComponent): +class RMMMemoryUsage(DashboardComponent, MemoryColor): """ GPU memory usage plot that includes information about memory managed by RMM. If an RMM pool is being used, shows the amount of pool memory utilized. """ + @log_errors def __init__(self, scheduler, width=600, **kwargs): - with log_errors(): - self.last = 0 - self.scheduler = scheduler - self.source = ColumnDataSource( - { - "rmm-used": [1, 2], - "rmm-used-half": [0.5, 1], - "rmm-total": [2, 4], - "rmm-total-half": [1, 2], - "external-used": [2, 1], - "external-used-x": [3, 4.5], - "worker": ["a", "b"], - "gpu-index": [0, 0], - "y": [1, 2], - "escaped_worker": ["a", "b"], - "rmm_memory_text": [ - "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B", - "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B", - ], - } - ) - - memory = figure( - title="RMM Memory", - tools="", - width=int(width / 2), - name="rmm_memory_histogram", - **kwargs, - ) - - rect = memory.rect( - source=self.source, - x="rmm-used-half", - y="y", - width="rmm-used", - height=1, - color="#76B900", - alpha=1.0, - ) - rect.nonselection_glyph = None - - rect = memory.rect( - source=self.source, - x="rmm-total-half", - y="y", - width="rmm-total", - height=1, - color="#76B900", - alpha=0.75, - ) - rect.nonselection_glyph = None - - rect = memory.rect( - source=self.source, - x="external-used-x", - y="y", - width="external-used", - height=1, - color="#76B900", - alpha=0.5, - ) - rect.nonselection_glyph = None - - memory.axis[0].ticker = BasicTicker(**TICKS_1024) - memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b") - memory.xaxis.major_label_orientation = -math.pi / 12 - memory.x_range.start = 0 - - for fig in [memory]: - fig.xaxis.minor_tick_line_alpha = 0 - fig.yaxis.visible = False - fig.ygrid.visible = False - - tap = TapTool( - callback=OpenURL(url="./info/worker/@escaped_worker.html") - ) - fig.add_tools(tap) - - fig.toolbar_location = None - fig.yaxis.visible = False - - hover = HoverTool() - hover.tooltips = "@worker : @rmm_memory_text" - hover.point_policy = "follow_mouse" - memory.add_tools(hover) - - self.memory_figure = memory + DashboardComponent.__init__(self) + MemoryColor.__init__(self) + + self.last = 0 + self.scheduler = scheduler + self.source = ColumnDataSource( + { + "width": [], + "x": [], + "y": [], + "color": [], + "alpha": [], + "worker": [], + "escaped_worker": [], + "rmm_used": [], + "rmm_total": [], + "gpu_used": [], + "gpu_total": [], + "spilled": [], + } + ) + + self.root = figure( + title="RMM memory used", + tools="", + width=int(width / 2), + name="rmm_memory", + **kwargs, + ) + rect = self.root.rect( + source=self.source, + x="x", + y="y", + width="width", + height=0.9, + color="color", + fill_alpha="alpha", + line_width=0, + ) + rect.nonselection_glyph = None + + self.root.axis[0].ticker = 
BasicTicker(**TICKS_1024)
+        self.root.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
+        self.root.xaxis.major_label_orientation = XLABEL_ORIENTATION
+        self.root.xaxis.minor_tick_line_alpha = 0
+        self.root.x_range = Range1d(start=0)
+        self.root.yaxis.visible = False
+        self.root.ygrid.visible = False
+        self.root.toolbar_location = None
+
+        tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
+        self.root.add_tools(tap)
+
+        hover = HoverTool(
+            point_policy="follow_mouse",
+            tooltips="""
+                <div>
+                    <span style="font-size: 12px; font-weight: bold;">Worker:</span>&nbsp;
+                    <span style="font-size: 12px; font-family: Monaco, monospace;">@worker</span>
+                </div>
+                <div>
+                    <span style="font-size: 12px; font-weight: bold;">RMM memory used:</span>&nbsp;
+                    <span style="font-size: 12px; font-family: Monaco, monospace;">@rmm_used{0.00 b} / @rmm_total{0.00 b}</span>
+                </div>
+                <div>
+                    <span style="font-size: 12px; font-weight: bold;">Total GPU memory used:</span>&nbsp;
+                    <span style="font-size: 12px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
+                </div>
+                <div>
+                    <span style="font-size: 12px; font-weight: bold;">Spilled to CPU:</span>&nbsp;
+                    <span style="font-size: 12px; font-family: Monaco, monospace;">@spilled{0.00 b}</span>
+                </div>
+ """, + ) + self.root.add_tools(hover) @without_property_validation + @log_errors def update(self): - with log_errors(): - workers = list(self.scheduler.workers.values()) - rmm_total = [] - rmm_used = [] - external_used = [] - gpu_index = [] - y = [] - worker = [] - external_used_x = [] - memory_max = 0 - gpu_total = [] - rmm_memory_text = [] - - for idx, ws in enumerate(workers): - try: - rmm_metrics = ws.metrics["rmm"] - gpu_metrics = ws.metrics["gpu"] - gpu_info = ws.extra["gpu"] - except KeyError: - continue - rmm_total_worker = rmm_metrics["rmm-total"] # RMM memory only - rmm_used_worker = rmm_metrics["rmm-used"] - gpu_total_worker = gpu_info["memory-total"] # All GPU memory - gpu_used_worker = gpu_metrics["memory-used"] - - external_used_worker = gpu_used_worker - rmm_total_worker - - rmm_total.append(rmm_total_worker) - rmm_used.append(rmm_used_worker) - gpu_total.append(gpu_total_worker) - external_used.append(external_used_worker) - external_used_x.append(rmm_total_worker + external_used_worker / 2) - worker.append(ws.address) - gpu_index.append(idx) - y.append(idx) - - memory_max = max(memory_max, gpu_total_worker) - - rmm_memory_text.append( - "RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format( - format_bytes(rmm_used_worker), - format_bytes(rmm_total_worker), - format_bytes(gpu_used_worker), - format_bytes(gpu_total_worker), - ) - ) - - self.memory_figure.title.text = dedent( - """\ - RMM Utilization: {} / {} - GPU Memory: {} / {} - """.format( - format_bytes(sum(rmm_used)), - format_bytes(sum(rmm_total)), - format_bytes(sum([*rmm_total, *external_used])), - format_bytes(sum(gpu_total)), - ) - ) - - result = { - "rmm-total": rmm_total, - "rmm-used": rmm_used, - "external-used": external_used, - "rmm-total-half": [m // 2 for m in rmm_total], - "rmm-used-half": [m // 2 for m in rmm_used], - "external-used-x": external_used_x, - "worker": worker, - "gpu-index": gpu_index, - "y": y, - "escaped_worker": [escape.url_escape(w) for w in worker], - "rmm_memory_text": rmm_memory_text, - } - - self.memory_figure.x_range.end = memory_max - - update(self.source, result) - - + def quadlist(i: Iterable[T]) -> list[T]: + out = [] + for ii in i: + out += [ii, ii, ii, ii] + return out + + workers = list(self.scheduler.workers.values()) + + width = [] + x = [] + color = [] + max_limit = 0 + rmm_used = [] + rmm_total = [] + gpu_used = [] + gpu_total = [] + spilled = [] + + for ws in workers: + try: + rmm_metrics = ws.metrics["rmm"] + gpu_metrics = ws.metrics["gpu"] + gpu_info = ws.extra["gpu"] + cudf_metrics = ws.metrics["cudf"] + except KeyError: + continue + + rmm_used_worker = rmm_metrics["rmm-used"] # RMM memory only + rmm_total_worker = rmm_metrics["rmm-total"] + gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory + gpu_total_worker = gpu_info["memory-total"] + spilled_worker = cudf_metrics["cudf-spilled"] # memory spilled to host + + max_limit = max(max_limit, gpu_total_worker + spilled_worker) + color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status) + + width += [ + rmm_used_worker, + rmm_total_worker - rmm_used_worker, + gpu_used_worker - rmm_total_worker, + spilled_worker, + ] + x += [sum(width[-4:i]) + width[i] / 2 for i in range(-4, 0)] + color += [color_i, color_i, color_i, "grey"] + + # memory info + rmm_used.append(rmm_used_worker) + rmm_total.append(rmm_total_worker) + gpu_used.append(gpu_used_worker) + gpu_total.append(gpu_total_worker) + spilled.append(spilled_worker) + + title = f"RMM memory used: {format_bytes(sum(rmm_used))} / 
{format_bytes(sum(rmm_total))}\nTotal GPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}" + if sum(spilled): + title += f" + {format_bytes(sum(spilled))} spilled to CPU" + self.root.title.text = title + + result = { + "width": width, + "x": x, + "y": quadlist(range(len(workers))), + "color": color, + "alpha": [1, 0.7, 0.4, 1] * len(workers), + "worker": quadlist(ws.address for ws in workers), + "escaped_worker": quadlist(escape.url_escape(ws.address) for ws in workers), + "rmm_used": quadlist(rmm_used), + "rmm_total": quadlist(rmm_total), + "gpu_used": quadlist(gpu_used), + "gpu_total": quadlist(gpu_total), + "spilled": quadlist(spilled), + } + + self.root.x_range.end = max_limit + update(self.source, result) + + +@log_errors def rmm_memory_doc(scheduler, extra, doc): - with log_errors(): - rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both") - rmm_load.update() - add_periodic_callback(doc, rmm_load, 100) - doc.add_root(rmm_load.memory_figure) - doc.theme = BOKEH_THEME + rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both") + rmm_load.update() + add_periodic_callback(doc, rmm_load, 100) + doc.add_root(rmm_load.root) + doc.theme = BOKEH_THEME diff --git a/distributed/diagnostics/cudf.py b/distributed/diagnostics/cudf.py index 01118fb0c8..a3404ee490 100644 --- a/distributed/diagnostics/cudf.py +++ b/distributed/diagnostics/cudf.py @@ -12,15 +12,14 @@ def real_time(): if get_global_manager is None: - return {} + return {"cudf-spilled": 0} mgr = get_global_manager() if mgr is None: - return {} - keys = { - "gpu-to-cpu": {"nbytes": 0, "time": 0}, - "cpu-to-gpu": {"nbytes": 0, "time": 0}, + return {"cudf-spilled": 0} + + totals = mgr.statistics.spill_totals + + return { + "cudf-spilled": totals.get(("gpu", "cpu"), (0,))[0] + - totals.get(("cpu", "gpu"), (0,))[0] } - for (src, dst), (nbytes, time) in mgr.statistics.spill_totals.items(): - keys[f"{src}-to-{dst}"]["nbytes"] = nbytes - keys[f"{src}-to-{dst}"]["time"] = time - return keys From eeddf1e4af209cc0881119d1c34b132877e0dc1a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 1 Sep 2023 12:05:16 -0700 Subject: [PATCH 04/15] Fix memory limit on x axis --- distributed/dashboard/components/cudf.py | 112 ----------------------- distributed/dashboard/components/rmm.py | 8 +- 2 files changed, 5 insertions(+), 115 deletions(-) delete mode 100644 distributed/dashboard/components/cudf.py diff --git a/distributed/dashboard/components/cudf.py b/distributed/dashboard/components/cudf.py deleted file mode 100644 index 8f31f17895..0000000000 --- a/distributed/dashboard/components/cudf.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import math - -from bokeh.core.properties import without_property_validation -from bokeh.models import BasicTicker, ColumnDataSource, NumeralTickFormatter -from bokeh.plotting import figure - -from distributed.dashboard.components import DashboardComponent, add_periodic_callback -from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024 -from distributed.dashboard.utils import update -from distributed.utils import log_errors - - -class CudfSpillingStatistics(DashboardComponent): - """ - Plot giving an overview of per-worker GPU spilling statistics, including the number - of bytes spilled to/from CPU and the time spent spilling. 
- """ - - def __init__(self, scheduler, width=600, **kwargs): - with log_errors(): - self.last = 0 - self.scheduler = scheduler - self.source = ColumnDataSource( - { - "from-gpu": [1, 2], - "from-gpu-half": [0.5, 1], - "worker": ["a", "b"], - "gpu-index": [0, 0], - "y": [1, 2], - } - ) - - bytes_spilled = figure( - title="Bytes spilled from GPU", - tools="", - width=int(width / 2), - name="bytes_spilled_histogram", - **kwargs, - ) - - rect = bytes_spilled.rect( - source=self.source, - x="from-gpu-half", - y="y", - width="from-gpu", - height=1, - color="#76B900", - alpha=1.0, - ) - rect.nonselection_glyph = None - - bytes_spilled.axis[0].ticker = BasicTicker(**TICKS_1024) - bytes_spilled.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b") - bytes_spilled.xaxis.major_label_orientation = -math.pi / 12 - bytes_spilled.x_range.start = 0 - - for fig in [bytes_spilled]: - fig.xaxis.minor_tick_line_alpha = 0 - fig.yaxis.visible = False - fig.ygrid.visible = False - - fig.toolbar_location = None - fig.yaxis.visible = False - - self.bytes_spilled_figure = bytes_spilled - - @without_property_validation - def update(self): - with log_errors(): - workers = list(self.scheduler.workers.values()) - from_gpu = [] - gpu_index = [] - y = [] - worker = [] - memory_max = 0 - - for idx, ws in enumerate(workers): - try: - cudf_metrics = ws.metrics["cudf"] - gpu_info = ws.extra["gpu"] - except KeyError: - continue - - from_gpu.append(cudf_metrics["gpu-to-cpu"]["nbytes"]) - worker.append(ws.address) - gpu_index.append(idx) - y.append(idx) - - memory_max = max(memory_max, gpu_info["memory-total"]) - - result = { - "from-gpu": from_gpu, - "from-gpu-half": [m // 2 for m in from_gpu], - "worker": worker, - "gpu-index": gpu_index, - "y": y, - } - - self.bytes_spilled_figure.x_range.end = memory_max - - update(self.source, result) - - -def cudf_spilling_doc(scheduler, extra, doc): - with log_errors(): - cudf_spilling = CudfSpillingStatistics(scheduler, sizing_mode="stretch_both") - cudf_spilling.update() - add_periodic_callback(doc, cudf_spilling, 100) - doc.add_root(cudf_spilling.bytes_spilled_figure) - doc.theme = BOKEH_THEME diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index b53c1ab0de..628885a4a7 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -105,7 +105,7 @@ def __init__(self, scheduler, width=600, **kwargs): @rmm_used{0.00 b} / @rmm_total{0.00 b}
-                    <span style="font-size: 12px; font-weight: bold;">Total GPU memory used:</span>&nbsp;
+                    <span style="font-size: 12px; font-weight: bold;">GPU memory used:</span>&nbsp;
                     <span style="font-size: 12px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
                 </div>
@@ -152,7 +152,9 @@ def quadlist(i: Iterable[T]) -> list[T]: gpu_total_worker = gpu_info["memory-total"] spilled_worker = cudf_metrics["cudf-spilled"] # memory spilled to host - max_limit = max(max_limit, gpu_total_worker + spilled_worker) + max_limit = max( + max_limit, gpu_total_worker, gpu_used_worker + spilled_worker + ) color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status) width += [ @@ -171,7 +173,7 @@ def quadlist(i: Iterable[T]) -> list[T]: gpu_total.append(gpu_total_worker) spilled.append(spilled_worker) - title = f"RMM memory used: {format_bytes(sum(rmm_used))} / {format_bytes(sum(rmm_total))}\nTotal GPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}" + title = f"RMM memory used: {format_bytes(sum(rmm_used))} / {format_bytes(sum(rmm_total))}\nGPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}" if sum(spilled): title += f" + {format_bytes(sum(spilled))} spilled to CPU" self.root.title.text = title From 043835cea443674fea893664655c7d943d41c3e5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 1 Sep 2023 12:08:28 -0700 Subject: [PATCH 05/15] Remove unused dashboard plot --- distributed/dashboard/scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py index c9e8ed9982..4fbe8b24b1 100644 --- a/distributed/dashboard/scheduler.py +++ b/distributed/dashboard/scheduler.py @@ -6,7 +6,6 @@ from tornado import web from tornado.ioloop import IOLoop -from distributed.dashboard.components.cudf import cudf_spilling_doc from distributed.dashboard.components.nvml import ( gpu_doc, gpu_memory_doc, @@ -120,7 +119,6 @@ "/individual-gpu-memory": gpu_memory_doc, "/individual-gpu-utilization": gpu_utilization_doc, "/individual-rmm-memory": rmm_memory_doc, - "/individual-cudf-spilling": cudf_spilling_doc, } From 0ac3344f1bcd4323325380075bed0ab88fc0aca3 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 6 Sep 2023 08:25:28 -0700 Subject: [PATCH 06/15] Allow MemoryColor colors to be overridden --- distributed/dashboard/components/rmm.py | 2 +- distributed/dashboard/components/scheduler.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index 628885a4a7..9f258b787c 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -41,7 +41,7 @@ class RMMMemoryUsage(DashboardComponent, MemoryColor): @log_errors def __init__(self, scheduler, width=600, **kwargs): DashboardComponent.__init__(self) - MemoryColor.__init__(self) + MemoryColor.__init__(self, neutral_color="#76B900") self.last = 0 self.scheduler = scheduler diff --git a/distributed/dashboard/components/scheduler.py b/distributed/dashboard/components/scheduler.py index d25c56f65e..02d563659d 100644 --- a/distributed/dashboard/components/scheduler.py +++ b/distributed/dashboard/components/scheduler.py @@ -276,10 +276,15 @@ class MemoryColor: orange: float red: float - def __init__(self): + def __init__(self, neutral_color="blue", target_color="orange", terminated_color="red"): + self.neutral_color = neutral_color + self.target_color = target_color + self.terminated_color = terminated_color + target = dask.config.get("distributed.worker.memory.target") spill = dask.config.get("distributed.worker.memory.spill") terminate 
= dask.config.get("distributed.worker.memory.terminate") + # These values can be False. It's also common to configure them to impossibly # high values to achieve the same effect. self.orange = min(target or math.inf, spill or math.inf) @@ -287,14 +292,14 @@ def __init__(self): def _memory_color(self, current: int, limit: int, status: Status) -> str: if status != Status.running: - return "red" + return self.terminated_color if not limit: - return "blue" + return self.neutral_color if current >= limit * self.red: - return "red" + return self.terminated_color if current >= limit * self.orange: - return "orange" - return "blue" + return self.target_color + return self.neutral_color class ClusterMemory(DashboardComponent, MemoryColor): From bb491356a739dbbab85882bf0954309ca0ec68f8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 6 Sep 2023 08:40:39 -0700 Subject: [PATCH 07/15] Linting --- distributed/dashboard/components/scheduler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/distributed/dashboard/components/scheduler.py b/distributed/dashboard/components/scheduler.py index 02d563659d..4d4c736ff6 100644 --- a/distributed/dashboard/components/scheduler.py +++ b/distributed/dashboard/components/scheduler.py @@ -276,7 +276,9 @@ class MemoryColor: orange: float red: float - def __init__(self, neutral_color="blue", target_color="orange", terminated_color="red"): + def __init__( + self, neutral_color="blue", target_color="orange", terminated_color="red" + ): self.neutral_color = neutral_color self.target_color = target_color self.terminated_color = terminated_color From 50e626aeb78f8ff90dd1c4d1cd2fcac64c109059 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 7 Sep 2023 09:09:33 -0700 Subject: [PATCH 08/15] Add cudf diagnostics test --- continuous_integration/gpuci/build.sh | 4 ++ .../tests/test_cudf_diagnostics.py | 39 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 distributed/diagnostics/tests/test_cudf_diagnostics.py diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 37b7ab4370..76048eddd2 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -26,6 +26,10 @@ export CUDA_REL=${CUDA_VERSION%.*} # FIXME - monitoring GIL contention causes UCX teardown issues export DASK_DISTRIBUTED__ADMIN__SYSTEM_MONITOR__GIL__ENABLED=False +# enable cuDF spilling to host +export CUDF_SPILL=on +export CUDF_SPILL_STATS=1 + ################################################################################ # SETUP - Check environment ################################################################################ diff --git a/distributed/diagnostics/tests/test_cudf_diagnostics.py b/distributed/diagnostics/tests/test_cudf_diagnostics.py new file mode 100644 index 0000000000..9c2bbb825a --- /dev/null +++ b/distributed/diagnostics/tests/test_cudf_diagnostics.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import pytest + +from distributed.utils_test import gen_cluster + +pytestmark = pytest.mark.gpu + +cudf = pytest.importorskip("cudf") +dask_cuda = pytest.importorskip("dask_cuda") + + +def force_spill(): + from cudf.core.buffer.spill_manager import get_global_manager + + manager = get_global_manager() + + # 24 bytes + df = cudf.DataFrame({"a": [1, 2, 3]}) + + return manager.spill_to_device_limit(1) + + +@gen_cluster( + client=True, + 
nthreads=[("127.0.0.1", 1)], + Worker=dask_cuda.CUDAWorker, +) +async def test_cudf_metrics(c, s, *workers): + w = list(s.workers.values())[0] + assert "cudf" in w.metrics + assert w.metrics["cudf"]["cudf-spilled"] == 0 + + try: + await c.run(force_spill) + except AttributeError: + pytest.xfail("cuDF spilling & spilling statistics must be enabled") + + assert w.metrics["cudf"]["cudf-spilled"] == 24 From 082ddefca3cb6e6c9e80eea15277d2805bca2d93 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 12 Sep 2023 09:51:00 -0700 Subject: [PATCH 09/15] Resolve bokeh test failures --- distributed/dashboard/components/rmm.py | 10 ++++++++-- distributed/diagnostics/cudf.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index 9f258b787c..7376476570 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -142,15 +142,21 @@ def quadlist(i: Iterable[T]) -> list[T]: rmm_metrics = ws.metrics["rmm"] gpu_metrics = ws.metrics["gpu"] gpu_info = ws.extra["gpu"] + except KeyError: + rmm_metrics = {"rmm-used": 0, "rmm-total": 0} + gpu_metrics = {"memory-used": 0} + gpu_info = {"memory-total": 0} + + try: cudf_metrics = ws.metrics["cudf"] except KeyError: - continue + cudf_metrics = {"cudf-spilled": 0} rmm_used_worker = rmm_metrics["rmm-used"] # RMM memory only rmm_total_worker = rmm_metrics["rmm-total"] gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory gpu_total_worker = gpu_info["memory-total"] - spilled_worker = cudf_metrics["cudf-spilled"] # memory spilled to host + spilled_worker = cudf_metrics["cudf-spilled"] or 0 # memory spilled to host max_limit = max( max_limit, gpu_total_worker, gpu_used_worker + spilled_worker diff --git a/distributed/diagnostics/cudf.py b/distributed/diagnostics/cudf.py index a3404ee490..c118f7e503 100644 --- a/distributed/diagnostics/cudf.py +++ b/distributed/diagnostics/cudf.py @@ -12,10 +12,10 @@ def real_time(): if get_global_manager is None: - return {"cudf-spilled": 0} + return {"cudf-spilled": None} mgr = get_global_manager() if mgr is None: - return {"cudf-spilled": 0} + return {"cudf-spilled": None} totals = mgr.statistics.spill_totals From b60173d229b92985c1b9ee6e83f2d52c267ab994 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 28 Sep 2023 07:47:39 -0700 Subject: [PATCH 10/15] Make cudf spilling monitoring optional and disabled by default --- distributed/distributed-schema.yaml | 8 +++++++- distributed/distributed.yaml | 1 + distributed/worker.py | 26 ++++++++++++++------------ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/distributed/distributed-schema.yaml b/distributed/distributed-schema.yaml index 510db30123..6985a39574 100644 --- a/distributed/distributed-schema.yaml +++ b/distributed/distributed-schema.yaml @@ -1001,6 +1001,12 @@ properties: not a problem and will be automatically disabled if no GPUs are found in the system, but in certain cases it may be desirable to completely disable NVML diagnostics. + cudf: + type: boolean + description: | + If ``True``, enables tracking of GPU spilling and unspilling managed by cuDF (if it is enabled). + Note that this forces a cuDF import at worker startup, which may be undesirable for performance + and memory footprint. 
computations: type: object properties: @@ -1008,7 +1014,7 @@ properties: type: integer minimum: 0 description: | - The maximum number of Computations to remember. + The maximum number of computations to remember. nframes: type: integer minimum: 0 diff --git a/distributed/distributed.yaml b/distributed/distributed.yaml index 594ee22532..874735a6f0 100644 --- a/distributed/distributed.yaml +++ b/distributed/distributed.yaml @@ -267,6 +267,7 @@ distributed: diagnostics: nvml: True + cudf: False computations: max-history: 100 nframes: 0 diff --git a/distributed/worker.py b/distributed/worker.py index 5f5ebcdab0..e56de15a86 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -78,7 +78,7 @@ ) from distributed.core import rpc as RPCType from distributed.core import send_recv -from distributed.diagnostics import cudf, nvml, rmm +from distributed.diagnostics import nvml, rmm from distributed.diagnostics.plugin import WorkerPlugin, _get_plugin_name from distributed.diskutils import WorkSpace from distributed.exceptions import Reschedule @@ -3221,19 +3221,21 @@ async def rmm_metric(worker): DEFAULT_METRICS["rmm"] = rmm_metric del _rmm +# avoid importing cuDF unless explicitly enabled +if dask.config.get("distributed.diagnostics.cudf"): + try: + import cudf as _cudf # noqa: F401 + except Exception: + pass + else: + from distributed.diagnostics import cudf -try: - import cudf as _cudf -except Exception: - pass -else: - - async def cudf_metric(worker): - result = await offload(cudf.real_time) - return result + async def cudf_metric(worker): + result = await offload(cudf.real_time) + return result - DEFAULT_METRICS["cudf"] = cudf_metric - del _cudf + DEFAULT_METRICS["cudf"] = cudf_metric + del _cudf def print( From 189013679dac92fbb9c25c0c8415106d34609bd8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:10:07 -0700 Subject: [PATCH 11/15] Modify cudf spilling test --- continuous_integration/gpuci/build.sh | 4 +++- distributed/diagnostics/tests/test_cudf_diagnostics.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 76048eddd2..8b01eeb5e4 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -26,9 +26,11 @@ export CUDA_REL=${CUDA_VERSION%.*} # FIXME - monitoring GIL contention causes UCX teardown issues export DASK_DISTRIBUTED__ADMIN__SYSTEM_MONITOR__GIL__ENABLED=False -# enable cuDF spilling to host +# enable monitoring of cuDF spilling export CUDF_SPILL=on export CUDF_SPILL_STATS=1 +export DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 + ################################################################################ # SETUP - Check environment diff --git a/distributed/diagnostics/tests/test_cudf_diagnostics.py b/distributed/diagnostics/tests/test_cudf_diagnostics.py index 9c2bbb825a..319281a6c4 100644 --- a/distributed/diagnostics/tests/test_cudf_diagnostics.py +++ b/distributed/diagnostics/tests/test_cudf_diagnostics.py @@ -29,7 +29,11 @@ def force_spill(): async def test_cudf_metrics(c, s, *workers): w = list(s.workers.values())[0] assert "cudf" in w.metrics - assert w.metrics["cudf"]["cudf-spilled"] == 0 + + if spill_initial := w.metrics["cudf"]["cudf-spilled"] is None: + pytest.xfail("cuDF spilling & spilling statistics must be enabled") + + assert spill_initial == 0 try: await c.run(force_spill) From 5eafddcdfee1c3448db2125816118f0f64d38d52 Mon Sep 17 00:00:00 2001 
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 29 Sep 2023 10:19:18 -0700 Subject: [PATCH 12/15] Test cuDF spill tests in separate process --- continuous_integration/gpuci/build.sh | 4 ++++ .../tests/test_cudf_diagnostics.py | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 8b01eeb5e4..54e59df902 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -62,3 +62,7 @@ conda list --show-channel-urls gpuci_logger "Python py.test for distributed" py.test distributed -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml" + +# cuDF spill stats monitoring must be enabled for this test +CUDF_SPILL=on CUDF_SPILL_STATS=1 DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 \ + py.test distributed/diagnostics/tests/test_cudf_diagnostics.py -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml" diff --git a/distributed/diagnostics/tests/test_cudf_diagnostics.py b/distributed/diagnostics/tests/test_cudf_diagnostics.py index 319281a6c4..ee9ae12f89 100644 --- a/distributed/diagnostics/tests/test_cudf_diagnostics.py +++ b/distributed/diagnostics/tests/test_cudf_diagnostics.py @@ -1,10 +1,20 @@ from __future__ import annotations +import os + import pytest from distributed.utils_test import gen_cluster -pytestmark = pytest.mark.gpu +pytestmark = [ + pytest.mark.gpu, + pytest.mark.skipif( + os.environ.get("CUDF_SPILL", "off") != "on" + or os.environ.get("CUDF_SPILL_STATS", "0") != "1" + or os.environ.get("DASK_DISTRIBUTED__DIAGNOSTICS__CUDF", "0") != "1", + reason="cuDF spill stats monitoring must be enabled manually", + ), +] cudf = pytest.importorskip("cudf") dask_cuda = pytest.importorskip("dask_cuda") @@ -29,15 +39,8 @@ def force_spill(): async def test_cudf_metrics(c, s, *workers): w = list(s.workers.values())[0] assert "cudf" in w.metrics + assert w.metrics["cudf"]["cudf-spilled"] == 0 - if spill_initial := w.metrics["cudf"]["cudf-spilled"] is None: - pytest.xfail("cuDF spilling & spilling statistics must be enabled") - - assert spill_initial == 0 - - try: - await c.run(force_spill) - except AttributeError: - pytest.xfail("cuDF spilling & spilling statistics must be enabled") + await c.run(force_spill) assert w.metrics["cudf"]["cudf-spilled"] == 24 From 21e106b35e1f46697a745263d059c8c29662b092 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 2 Oct 2023 06:38:32 -0700 Subject: [PATCH 13/15] Remove global cuDF spilling settings from build.sh --- continuous_integration/gpuci/build.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 54e59df902..ee018779bb 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -26,12 +26,6 @@ export CUDA_REL=${CUDA_VERSION%.*} # FIXME - monitoring GIL contention causes UCX teardown issues export DASK_DISTRIBUTED__ADMIN__SYSTEM_MONITOR__GIL__ENABLED=False -# enable monitoring of cuDF spilling -export CUDF_SPILL=on -export CUDF_SPILL_STATS=1 -export DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 - - ################################################################################ # SETUP - Check environment ################################################################################ From 3cc4b946ab749479b0ff920e0e6a23d87b2ab854 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca 
<20627856+charlesbluca@users.noreply.github.com> Date: Mon, 2 Oct 2023 18:21:31 -0700 Subject: [PATCH 14/15] cuDF metrics test is flaky --- distributed/diagnostics/tests/test_cudf_diagnostics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/distributed/diagnostics/tests/test_cudf_diagnostics.py b/distributed/diagnostics/tests/test_cudf_diagnostics.py index ee9ae12f89..395eb901de 100644 --- a/distributed/diagnostics/tests/test_cudf_diagnostics.py +++ b/distributed/diagnostics/tests/test_cudf_diagnostics.py @@ -36,6 +36,7 @@ def force_spill(): nthreads=[("127.0.0.1", 1)], Worker=dask_cuda.CUDAWorker, ) +@pytest.mark.flaky(reruns=10, reruns_delay=5) async def test_cudf_metrics(c, s, *workers): w = list(s.workers.values())[0] assert "cudf" in w.metrics From b2fdfc67f634bf5e1063c22b2643e60baed2d04a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 25 Oct 2023 08:30:11 -0700 Subject: [PATCH 15/15] Shouldn't need dask-cuda worker for test --- distributed/diagnostics/tests/test_cudf_diagnostics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/distributed/diagnostics/tests/test_cudf_diagnostics.py b/distributed/diagnostics/tests/test_cudf_diagnostics.py index 395eb901de..feb5681855 100644 --- a/distributed/diagnostics/tests/test_cudf_diagnostics.py +++ b/distributed/diagnostics/tests/test_cudf_diagnostics.py @@ -17,7 +17,6 @@ ] cudf = pytest.importorskip("cudf") -dask_cuda = pytest.importorskip("dask_cuda") def force_spill(): @@ -34,7 +33,6 @@ def force_spill(): @gen_cluster( client=True, nthreads=[("127.0.0.1", 1)], - Worker=dask_cuda.CUDAWorker, ) @pytest.mark.flaky(reruns=10, reruns_delay=5) async def test_cudf_metrics(c, s, *workers):
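
For anyone exercising the series end to end: patch 03 collapses cuDF's per-direction spill totals into a single net figure, `spill_totals[("gpu", "cpu")][0] - spill_totals[("cpu", "gpu")][0]`, reported to the scheduler as `cudf-spilled`, and patch 10 makes the whole diagnostic opt-in via `distributed.diagnostics.cudf`. Below is a minimal sketch of enabling the diagnostic and reading the metric from the scheduler; it is illustrative rather than part of the patches, assumes a GPU host with cuDF installed, and the `spilled_bytes` helper name is hypothetical:

    import os

    # The gate added in patch 10 is evaluated when distributed.worker is first
    # imported, so both it and cuDF's own spilling must be enabled up front
    # (the same variables the gpuci script exports).
    os.environ["DASK_DISTRIBUTED__DIAGNOSTICS__CUDF"] = "1"
    os.environ["CUDF_SPILL"] = "on"       # enable cuDF managed spilling
    os.environ["CUDF_SPILL_STATS"] = "1"  # enable its spill statistics

    from distributed import Client, LocalCluster

    client = Client(LocalCluster(n_workers=1, threads_per_worker=1))

    def spilled_bytes(dask_scheduler):
        # With the diagnostic on, each worker heartbeat carries
        # {"cudf": {"cudf-spilled": <net bytes>}}; the value is None when
        # cuDF spill statistics are unavailable on the worker.
        return {
            addr: ws.metrics.get("cudf", {}).get("cudf-spilled")
            for addr, ws in dask_scheduler.workers.items()
        }

    print(client.run_on_scheduler(spilled_bytes))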