Skip to content

Commit 3c6e0a9

Browse files
authored
Update API reference and examples in docs (#561)
Addresses #554; I tried to tidy up the installation/specializations/quickstart pages so that some of the more technical stuff could go into some published examples, and probably a new page discussing configuration of LocalCUDACluster / `dask-cuda-worker`. Also added `sphinx-click` to the dependencies so we can more easily document any changes to `dask-cuda-worker` (#560) - I would also like to get the docstrings for the CLI, LocalCUDACluster, and `initialize()` up to date and more concise if possible. Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: #561
1 parent fde564e commit 3c6e0a9

17 files changed

+555
-484
lines changed

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ dask_cuda.egg-info/
2121
python/build
2222
python/cudf/bindings/*.cpp
2323
dask-worker-space/
24+
docs/_build/
2425

2526
## Patching
2627
*.diff

Diff for: dask_cuda/cli/dask_cuda_worker.py

+140-106
Original file line numberDiff line numberDiff line change
@@ -19,98 +19,69 @@
1919

2020
@click.command(context_settings=dict(ignore_unknown_options=True))
2121
@click.argument("scheduler", type=str, required=False)
22-
@click.option(
23-
"--tls-ca-file",
24-
type=pem_file_option_type,
25-
default=None,
26-
help="CA cert(s) file for TLS (in PEM format)",
27-
)
28-
@click.option(
29-
"--tls-cert",
30-
type=pem_file_option_type,
31-
default=None,
32-
help="certificate file for TLS (in PEM format)",
33-
)
34-
@click.option(
35-
"--tls-key",
36-
type=pem_file_option_type,
37-
default=None,
38-
help="private key file for TLS (in PEM format)",
39-
)
40-
@click.option("--dashboard-address", type=str, default=":0", help="dashboard address")
41-
@click.option(
42-
"--dashboard/--no-dashboard",
43-
"dashboard",
44-
default=True,
45-
show_default=True,
46-
required=False,
47-
help="Launch dashboard",
22+
@click.argument(
23+
"preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv
4824
)
4925
@click.option(
5026
"--host",
5127
type=str,
5228
default=None,
53-
help="Serving host. Should be an ip address that is"
54-
" visible to the scheduler and other workers. "
55-
"See --listen-address and --contact-address if you "
56-
"need different listen and contact addresses. "
57-
"See --interface.",
29+
help="""IP address of serving host; should be visible to the scheduler and other
30+
workers. Can be a string (like ``"127.0.0.1"``) or ``None`` to fall back on the
31+
address of the interface specified by ``--interface`` or the default interface.""",
5832
)
5933
@click.option(
60-
"--interface",
61-
type=str,
62-
default=None,
63-
help="The external interface used to connect to the scheduler, usually "
64-
"an ethernet interface is used for connection, and not an InfiniBand "
65-
"interface (if one is available).",
34+
"--nthreads",
35+
type=int,
36+
default=1,
37+
show_default=True,
38+
help="Number of threads to be used for each Dask worker process.",
6639
)
67-
@click.option("--nthreads", type=int, default=1, help="Number of threads per process.")
6840
@click.option(
6941
"--name",
7042
type=str,
7143
default=None,
72-
help="A unique name for this worker like 'worker-1'. "
73-
"If used with --nprocs then the process number "
74-
"will be appended like name-0, name-1, name-2, ...",
44+
help="""A unique name for the worker. Can be a string (like ``"worker-1"``) or
45+
``None`` for a nameless worker. If used with ``--nprocs``, then the process number
46+
will be appended to the worker name, e.g. ``"worker-1-0"``, ``"worker-1-1"``,
47+
``"worker-1-2"``.""",
7548
)
7649
@click.option(
7750
"--memory-limit",
7851
default="auto",
79-
help="Bytes of memory per process that the worker can use. "
80-
"This can be an integer (bytes), "
81-
"float (fraction of total system memory), "
82-
"string (like 5GB or 5000M), "
83-
"'auto', or zero for no memory management",
52+
show_default=True,
53+
help="""Bytes of memory per process that the worker can use. Can be an integer
54+
(bytes), float (fraction of total system memory), string (like ``"5GB"`` or
55+
``"5000M"``), or ``"auto"`` or 0 for no memory management.""",
8456
)
8557
@click.option(
8658
"--device-memory-limit",
8759
default="0.8",
88-
help="Specifies the size of the CUDA device LRU cache, which "
89-
"is used to determine when the worker starts spilling to host "
90-
"memory. This can be a float (fraction of total device "
91-
"memory), an integer (bytes), a string (like 5GB or 5000M), "
92-
"and 'auto' or 0 to disable spilling to host (i.e., allow "
93-
"full device memory usage). Default is 0.8, 80% of the "
94-
"worker's total device memory.",
60+
show_default=True,
61+
help="""Size of the CUDA device LRU cache, which is used to determine when the
62+
worker starts spilling to host memory. Can be an integer (bytes), float (fraction of
63+
total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
64+
disable spilling to host (i.e. allow full device memory usage).""",
9565
)
9666
@click.option(
9767
"--rmm-pool-size",
9868
default=None,
99-
help="If specified, initialize each worker with an RMM pool of "
100-
"the given size, otherwise no RMM pool is created. This can be "
101-
"an integer (bytes) or string (like 5GB or 5000M)."
102-
"NOTE: This size is a per worker (i.e., per GPU) configuration, "
103-
"and not cluster-wide!",
69+
help="""RMM pool size to initialize each worker with. Can be an integer (bytes),
70+
string (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools.
71+
72+
.. note::
73+
This size is a per-worker configuration, and not cluster-wide.""",
10474
)
10575
@click.option(
10676
"--rmm-managed-memory/--no-rmm-managed-memory",
10777
default=False,
108-
help="If enabled, initialize each worker with RMM and set it to "
109-
"use managed memory. If disabled, RMM may still be used if "
110-
"--rmm-pool-size is specified, but in that case with default "
111-
"(non-managed) memory type."
112-
"WARNING: managed memory is currently incompatible with NVLink, "
113-
"trying to enable both will result in an exception.",
78+
show_default=True,
79+
help="""Initialize each worker with RMM and set it to use managed memory. If
80+
disabled, RMM may still be used by specifying ``--rmm-pool-size``.
81+
82+
.. warning::
83+
Managed memory is currently incompatible with NVLink. Trying to enable both will
84+
result in failure.""",
11485
)
11586
@click.option(
11687
"--rmm-async/--no-rmm-async",
@@ -119,100 +90,163 @@
11990
help="""Initialize each worker with RMM and set it to use RMM's asynchronous
12091
allocator. See ``rmm.mr.CudaAsyncMemoryResource`` for more info.
12192
122-
.. note::
93+
.. warning::
12394
The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also
12495
incompatible with RMM pools and managed memory, trying to enable both will
12596
result in failure.""",
12697
)
12798
@click.option(
12899
"--rmm-log-directory",
129100
default=None,
130-
help="Directory to write per-worker RMM log files to; the client "
131-
"and scheduler are not logged here."
132-
"NOTE: Logging will only be enabled if --rmm-pool-size, "
133-
"--rmm-managed-memory, or --rmm-async are specified.",
101+
help="""Directory to write per-worker RMM log files to. The client and scheduler are
102+
not logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to disable
103+
logging.
104+
105+
.. note::
106+
Logging will only be enabled if ``--rmm-pool-size`` or ``--rmm-managed-memory``
107+
are specified.""",
108+
)
109+
@click.option(
110+
"--pid-file", type=str, default="", help="File to write the process PID.",
111+
)
112+
@click.option(
113+
"--resources",
114+
type=str,
115+
default="",
116+
help="""Resources for task constraints like ``"GPU=2 MEM=10e9"``. Resources are
117+
applied separately to each worker process (only relevant when starting multiple
118+
worker processes with ``--nprocs``).""",
134119
)
135120
@click.option(
136-
"--reconnect/--no-reconnect",
121+
"--dashboard/--no-dashboard",
122+
"dashboard",
137123
default=True,
138-
help="Reconnect to scheduler if disconnected",
124+
show_default=True,
125+
required=False,
126+
help="Launch the dashboard.",
139127
)
140-
@click.option("--pid-file", type=str, default="", help="File to write the process PID")
141128
@click.option(
142-
"--local-directory", default=None, type=str, help="Directory to place worker files"
129+
"--dashboard-address",
130+
type=str,
131+
default=":0",
132+
show_default=True,
133+
help="Relative address to serve the dashboard (if enabled).",
143134
)
144135
@click.option(
145-
"--resources",
136+
"--local-directory",
137+
default=None,
146138
type=str,
147-
default="",
148-
help='Resources for task constraints like "GPU=2 MEM=10e9". '
149-
"Resources are applied separately to each worker process "
150-
"(only relevant when starting multiple worker processes with '--nprocs').",
139+
help="""Path on local machine to store temporary files. Can be a string (like
140+
``"path/to/files"``) or ``None`` to fall back on the value of
141+
``dask.temporary-directory`` in the local Dask configuration, using the current
142+
working directory if this is not set.""",
151143
)
152144
@click.option(
153145
"--scheduler-file",
154146
type=str,
155147
default="",
156-
help="Filename to JSON encoded scheduler information. "
157-
"Use with dask-scheduler --scheduler-file",
148+
help="""Filename to JSON encoded scheduler information. To be used in conjunction
149+
with the equivalent ``dask-scheduler`` option.""",
158150
)
159151
@click.option(
160-
"--dashboard-prefix", type=str, default=None, help="Prefix for the Dashboard"
152+
"--interface",
153+
type=str,
154+
default=None,
155+
help="""External interface used to connect to the scheduler. Usually an ethernet
156+
interface is used for connection, and not an InfiniBand interface (if one is
157+
available). Can be a string (like ``"eth0"`` for NVLink or ``"ib0"`` for
158+
InfiniBand) or ``None`` to fall back on the default interface.""",
161159
)
162160
@click.option(
163161
"--preload",
164162
type=str,
165163
multiple=True,
166164
is_eager=True,
167-
help="Module that should be loaded by each worker process "
168-
'like "foo.bar" or "/path/to/foo.py"',
165+
help="""Module that should be loaded by each worker process like ``"foo.bar"`` or
166+
``"/path/to/foo.py"``.""",
169167
)
170-
@click.argument(
171-
"preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv
168+
@click.option(
169+
"--dashboard-prefix",
170+
type=str,
171+
default=None,
172+
help="""Prefix for the dashboard. Can be a string (like ...) or ``None`` for no
173+
prefix.""",
174+
)
175+
@click.option(
176+
"--tls-ca-file",
177+
type=pem_file_option_type,
178+
default=None,
179+
help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like
180+
``"path/to/certs"``), or ``None`` for no certificate(s).""",
181+
)
182+
@click.option(
183+
"--tls-cert",
184+
type=pem_file_option_type,
185+
default=None,
186+
help="""Certificate file for TLS (in PEM format). Can be a string (like
187+
``"path/to/certs"``), or ``None`` for no certificate(s).""",
188+
)
189+
@click.option(
190+
"--tls-key",
191+
type=pem_file_option_type,
192+
default=None,
193+
help="""Private key file for TLS (in PEM format). Can be a string (like
194+
``"path/to/certs"``), or ``None`` for no private key.""",
172195
)
173196
@click.option(
174197
"--enable-tcp-over-ucx/--disable-tcp-over-ucx",
175198
default=False,
176-
help="Enable TCP communication over UCX",
199+
show_default=True,
200+
help="""Set environment variables to enable TCP over UCX, even if InfiniBand and
201+
NVLink are not supported or disabled.""",
177202
)
178203
@click.option(
179204
"--enable-infiniband/--disable-infiniband",
180205
default=False,
181-
help="Enable InfiniBand communication",
206+
show_default=True,
207+
help="""Set environment variables to enable UCX over InfiniBand, implies
208+
``--enable-tcp-over-ucx``.""",
182209
)
183210
@click.option(
184-
"--enable-rdmacm/--disable-rdmacm",
211+
"--enable-nvlink/--disable-nvlink",
185212
default=False,
186-
help="Enable RDMA connection manager, currently requires InfiniBand enabled.",
213+
show_default=True,
214+
help="""Set environment variables to enable UCX over NVLink, implies
215+
``--enable-tcp-over-ucx``.""",
187216
)
188217
@click.option(
189-
"--enable-nvlink/--disable-nvlink",
218+
"--enable-rdmacm/--disable-rdmacm",
190219
default=False,
191-
help="Enable NVLink communication",
220+
show_default=True,
221+
help="""Set environment variables to enable UCX RDMA connection manager support,
222+
requires ``--enable-infiniband``.""",
192223
)
193224
@click.option(
194225
"--net-devices",
195226
type=str,
196227
default=None,
197-
help="When None (default), 'UCX_NET_DEVICES' will be left to its default. "
198-
"Otherwise, it must be a non-empty string with the interface name, such as "
199-
"such as 'eth0' or 'auto' to allow for automatically choosing the closest "
200-
"interface based on the system's topology. Normally used only with "
201-
"--enable-infiniband to specify the interface to be used by the worker, "
202-
"such as 'mlx5_0:1' or 'ib0'. "
203-
"WARNING: 'auto' requires UCX-Py to be installed and compiled with hwloc "
204-
"support. Additionally that will always use the closest interface, and "
205-
"that may cause unexpected errors if that interface is not properly "
206-
"configured or is disconnected, for that reason it's limited to "
207-
"InfiniBand only and will still cause unpredictable errors if not _ALL_ "
208-
"interfaces are connected and properly configured.",
228+
help="""Interface(s) used by workers for UCX communication. Can be a string (like
229+
``"eth0"`` for NVLink or ``"mlx5_0:1"``/``"ib0"`` for InfiniBand), ``"auto"``
230+
(requires ``--enable-infiniband``) to pick the optimal interface per-worker based on
231+
the system's topology, or ``None`` to stay with the default value of ``"all"`` (use
232+
all available interfaces).
233+
234+
.. warning::
235+
``"auto"`` requires UCX-Py to be installed and compiled with hwloc support.
236+
Unexpected errors can occur when using ``"auto"`` if any interfaces are
237+
disconnected or improperly configured.""",
209238
)
210239
@click.option(
211240
"--enable-jit-unspill/--disable-jit-unspill",
212241
default=None,
213-
help="Enable just-in-time unspilling. This is experimental and doesn't "
214-
"support memory spilling to disk Please see proxy_object.ProxyObject "
215-
"and proxify_host_file.ProxifyHostFile.",
242+
help="""Enable just-in-time unspilling. Can be a boolean or ``None`` to fall back on
243+
the value of ``dask.jit-unspill`` in the local Dask configuration, disabling
244+
unspilling if this is not set.
245+
246+
.. note::
247+
This is experimental and doesn't support memory spilling to disk. See
248+
``proxy_object.ProxyObject`` and ``proxify_host_file.ProxifyHostFile`` for more
249+
info.""",
216250
)
217251
def main(
218252
scheduler,

0 commit comments

Comments
 (0)