Skip to content

Commit 3cfec1d

Browse files
authored
Merge branch 'main' into topic/rm-ucx-mo
Signed-off-by: Raul Akhmetshin <[email protected]>
2 parents c388998 + acc9b23 commit 3cfec1d

File tree

27 files changed

+2895
-1548
lines changed

27 files changed

+2895
-1548
lines changed

benchmark/kvbench/commands/args.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ def nixl_bench_args(func):
214214
type=str,
215215
help="Comma-separated GPU CUDA device id to use for communication (only used with GPUNETIO backend)",
216216
)(func)
217+
func = click.option(
218+
"--gpunetio_oob_list",
219+
type=str,
220+
help="OOB network interface name for control path (only used with GPUNETIO backend)",
221+
)(func)
217222
func = click.option(
218223
"--hf3fs_iopool_size",
219224
type=int,

benchmark/kvbench/commands/nixlbench.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def __init__(
6363
benchmark_group="default",
6464
gds_mt_num_threads=1,
6565
gpunetio_device_list="0",
66+
gpunetio_oob_list="",
6667
hf3fs_iopool_size=64,
6768
obj_access_key="",
6869
obj_secret_key="",
@@ -115,6 +116,7 @@ def __init__(
115116
worker_type (str, optional): Type of worker. Defaults to "nixl".
116117
gds_mt_num_threads (int, optional): Number of threads for GDS_MT plugin. Defaults to 1.
117118
gpunetio_device_list (str, optional): GPU device list for GPUNETIO plugin. Defaults to "0".
119+
gpunetio_oob_list (str, optional): OOB network interface name for control path for GPUNETIO plugin. Defaults to "".
118120
hf3fs_iopool_size (int, optional): IO pool size for HF3FS plugin. Defaults to 64.
119121
obj_access_key (str, optional): Access key for OBJ/S3 plugin. Defaults to "".
120122
obj_secret_key (str, optional): Secret key for OBJ/S3 plugin. Defaults to "".
@@ -162,6 +164,7 @@ def __init__(
162164
self.worker_type = worker_type
163165
self.gds_mt_num_threads = gds_mt_num_threads
164166
self.gpunetio_device_list = gpunetio_device_list
167+
self.gpunetio_oob_list = gpunetio_oob_list
165168
self.hf3fs_iopool_size = hf3fs_iopool_size
166169
self.obj_access_key = obj_access_key
167170
self.obj_secret_key = obj_secret_key
@@ -320,6 +323,7 @@ def _params(self):
320323
"worker_type": self.worker_type,
321324
"gds_mt_num_threads": self.gds_mt_num_threads,
322325
"gpunetio_device_list": self.gpunetio_device_list,
326+
"gpunetio_oob_list": self.gpunetio_oob_list,
323327
"hf3fs_iopool_size": self.hf3fs_iopool_size,
324328
"obj_access_key": self.obj_access_key,
325329
"obj_secret_key": self.obj_secret_key,
@@ -379,6 +383,7 @@ def defaults():
379383
"benchmark_group": "default",
380384
"gds_mt_num_threads": 1,
381385
"gpunetio_device_list": "0",
386+
"gpunetio_oob_list": "",
382387
"hf3fs_iopool_size": 64,
383388
"obj_access_key": "",
384389
"obj_secret_key": "",

benchmark/nixlbench/README.md

Lines changed: 95 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ A comprehensive benchmarking tool for the NVIDIA Inference Xfer Library (NIXL) t
3333
## Features
3434

3535
- **Multiple Communication Backends**: UCX, GPUNETIO, Mooncake, Libfabric for network communication
36-
- **Storage Backend Support**: GDS, GDS_MT, POSIX, HF3FS, OBJ (S3) for storage operations
36+
- **Storage Backend Support**: GDS, GDS_MT, POSIX, HF3FS, OBJ (S3), GUSLI for storage operations
3737
- **Flexible Communication Patterns**:
3838
- **Pairwise**: Point-to-point communication between pairs
3939
- **Many-to-one**: Multiple initiators to single target
@@ -200,6 +200,7 @@ For development environments or when Docker is not available.
200200
- **DOCA**: NVIDIA DOCA SDK for GPUNetIO
201201
- **AWS SDK C++**: For S3 object storage backend
202202
- **GDS**: NVIDIA GPUDirect Storage
203+
- **GUSLI**: G3+ User Space Access Library for direct block device access
203204
- **NVSHMEM**: Required for NVSHMEM worker type
204205
- **hwloc**: Hardware locality detection (required for Libfabric only)
205206

@@ -304,6 +305,21 @@ sudo dpkg -i doca-host_3.1.0-091000-25.07-ubuntu2404_amd64.deb
304305
sudo apt-get update && sudo apt-get install -y doca-sdk-gpunetio libdoca-sdk-gpunetio-dev
305306
```
306307

308+
**GUSLI (Optional - for GUSLI backend):**
309+
```bash
310+
# Clone and build GUSLI
311+
git clone https://github.com/nvidia/gusli.git
312+
cd gusli
313+
make all BUILD_RELEASE=1 BUILD_FOR_UNITEST=0 VERBOSE=1 ALLOW_USE_URING=0
314+
315+
# Install library and headers
316+
sudo cp libgusli_clnt.so /usr/lib/
317+
sudo cp gusli_*.hpp /usr/include/
318+
sudo ldconfig
319+
```
320+
321+
**Note**: GUSLI must be built before building NIXL. See [GUSLI Plugin README](../../src/plugins/gusli/README.md) for detailed installation and usage instructions.
322+
307323
#### Python Environment Setup
308324
```bash
309325
# Install uv (modern Python package manager)
@@ -404,7 +420,7 @@ sudo systemctl start etcd && sudo systemctl enable etcd
404420
```
405421
--runtime_type NAME # Type of runtime to use [ETCD] (default: ETCD)
406422
--worker_type NAME # Worker to use to transfer data [nixl, nvshmem] (default: nixl)
407-
--backend NAME # Communication backend [UCX, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ] (default: UCX)
423+
--backend NAME # Communication backend [UCX, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ, GUSLI] (default: UCX)
408424
--benchmark_group NAME # Name of benchmark group for parallel runs (default: default)
409425
--etcd_endpoints URL # ETCD server URL for coordination (default: http://localhost:2379)
410426
```
@@ -486,13 +502,26 @@ sudo systemctl start etcd && sudo systemctl enable etcd
486502
--obj_req_checksum TYPE # Required checksum for S3 backend [supported, required] (default: supported)
487503
```
488504

505+
**GUSLI Backend:**
506+
```
507+
--device_list LIST # Device specs in format 'id:type:path' (e.g., '11:F:./store0.bin,27:K:/dev/nvme0n1')
508+
# Type: F (file), K (kernel device), N (networked server with t/u prefix)
509+
--gusli_client_name NAME # Client identifier (default: NIXLBench)
510+
--gusli_max_simultaneous_requests NUM # Concurrent request limit (default: 32)
511+
--gusli_device_security LIST # Comma-separated security flags per device (e.g., 'sec=0x3,sec=0x71')
512+
--gusli_bdev_byte_offset BYTES # Starting LBA offset in bytes (default: 1048576)
513+
--gusli_config_file CONTENT # Custom config file content (auto-generated if not provided)
514+
515+
Note: storage_enable_direct is automatically enabled for GUSLI backend
516+
```
517+
489518
### Using ETCD for Coordination
490519

491520
NIXL Benchmark uses an ETCD key-value store for coordination between benchmark workers. This is useful in containerized or cloud-native environments.
492521

493522
**ETCD Requirements:**
494523
- **Required**: Network backends (UCX, GPUNETIO, Mooncake, Libfabric) and multi-node setups
495-
- **Optional**: Storage backends (GDS, GDS_MT, POSIX, HF3FS, OBJ, S3) running as single instances
524+
- **Optional**: Storage backends (GDS, GDS_MT, POSIX, HF3FS, OBJ, GUSLI) running as single instances
496525
- **Required**: Storage backends when `--etcd_endpoints` is explicitly specified
497526

498527
**For multi-node benchmarks:**
@@ -573,6 +602,69 @@ The workers automatically coordinate ranks through ETCD as they connect.
573602
./nixlbench --backend POSIX --filepath /mnt/storage/testfile --posix_api_type URING --storage_enable_direct
574603
```
575604

605+
**GUSLI Backend (G3+ User Space Access Library):**
606+
607+
GUSLI provides direct user-space access to block storage devices, supporting local files, kernel block devices, and networked GUSLI servers.
608+
609+
**Note**: Direct I/O is automatically enabled when GUSLI backend is selected (no need to specify `--storage_enable_direct`).
610+
611+
```bash
612+
# Basic GUSLI benchmark - single file device
613+
./nixlbench --backend=GUSLI \
614+
--device_list="11:F:./store0.bin" \
615+
--num_initiator_dev=1 \
616+
--num_target_dev=1 \
617+
--op_type=WRITE
618+
619+
# NVMe device with custom security
620+
./nixlbench --backend=GUSLI \
621+
--device_list="27:K:/dev/nvme0n1" \
622+
--gusli_device_security="sec=0x7" \
623+
--num_initiator_dev=1 \
624+
--num_target_dev=1 \
625+
--op_type=READ
626+
627+
# Multi-device configuration
628+
./nixlbench --backend=GUSLI \
629+
--device_list="11:F:./store0.bin,14:K:/dev/zero,27:K:/dev/nvme0n1" \
630+
--gusli_device_security="sec=0x3,sec=0x71,sec=0x7" \
631+
--num_initiator_dev=3 \
632+
--num_target_dev=3 \
633+
--op_type=WRITE
634+
635+
# Networked GUSLI server (TCP)
636+
./nixlbench --backend=GUSLI \
637+
--device_list="20:N:t192.168.1.100" \
638+
--gusli_device_security="sec=0x10" \
639+
--num_initiator_dev=1 \
640+
--num_target_dev=1 \
641+
--op_type=WRITE
642+
643+
# High concurrency with multiple threads
644+
./nixlbench --backend=GUSLI \
645+
--device_list="27:K:/dev/nvme0n1" \
646+
--gusli_max_simultaneous_requests=128 \
647+
--num_threads=8 \
648+
--total_buffer_size=$((16*1024*1024*1024)) \
649+
--op_type=WRITE
650+
```
651+
652+
**GUSLI Device Types:**
653+
- `F`: File-backed storage (e.g., `11:F:./store0.bin`)
654+
- `K`: Kernel block device (e.g., `27:K:/dev/nvme0n1`, `14:K:/dev/zero`)
655+
- `N`: Networked GUSLI server with protocol prefix (e.g., `20:N:t192.168.1.100` for TCP, `21:N:u10.0.0.5` for UDP)
656+
657+
**GUSLI-Specific Parameters:**
658+
- `--gusli_client_name`: Client identifier (default: "NIXLBench")
659+
- `--gusli_max_simultaneous_requests`: Concurrent request limit (default: 32)
660+
- `--gusli_device_security`: Comma-separated security flags per device (default: "sec=0x3" for each device)
661+
- `--gusli_bdev_byte_offset`: Starting LBA offset in bytes (default: 1048576, i.e. 1 MiB)
662+
- `--gusli_config_file`: Custom config file content override
663+
664+
**Notes**:
665+
- Number of devices in `--device_list` must match `--num_initiator_dev` and `--num_target_dev`
666+
- Direct I/O is automatically enabled for GUSLI (no need to specify `--storage_enable_direct`)
667+
576668
### Worker Types
577669

578670
**NVSHMEM Worker:**

benchmark/nixlbench/src/utils/utils.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ DEFINE_string(
5050
"Name of NIXL backend [UCX, GDS, GDS_MT, POSIX, GPUNETIO, Mooncake, HF3FS, OBJ, GUSLI] \
5151
(only used with nixl worker)");
5252
DEFINE_string(initiator_seg_type, XFERBENCH_SEG_TYPE_DRAM, "Type of memory segment for initiator \
53-
[DRAM, VRAM, BLK]");
53+
[DRAM, VRAM]. Note: Storage backends always use DRAM locally.");
5454
DEFINE_string(target_seg_type, XFERBENCH_SEG_TYPE_DRAM, "Type of memory segment for target \
55-
[DRAM, VRAM, BLK]");
55+
[DRAM, VRAM]. Note: Storage backends determine remote type automatically.");
5656
DEFINE_string(scheme, XFERBENCH_SCHEME_PAIRWISE, "Scheme: pairwise, maytoone, onetomany, tp");
5757
DEFINE_string(mode, XFERBENCH_MODE_SG, "MODE: SG (Single GPU per proc), MG (Multi GPU per proc) [default: SG]");
5858
DEFINE_string(op_type, XFERBENCH_OP_WRITE, "Op type: READ, WRITE");
@@ -109,6 +109,9 @@ DEFINE_string (posix_api_type,
109109
// DOCA GPUNetIO options - only used when backend is DOCA GPUNetIO
110110
DEFINE_string(gpunetio_device_list, "0", "Comma-separated GPU CUDA device id to use for \
111111
communication (only used with nixl worker)");
112+
// DOCA GPUNetIO OOB (control path) interface option - only used when backend is DOCA GPUNetIO
113+
DEFINE_string(gpunetio_oob_list, "", "Comma-separated OOB network interface name \
114+
for control path (only used with nixl worker)");
112115

113116
// OBJ options - only used when backend is OBJ
114117
DEFINE_string(obj_access_key, "", "Access key for S3 backend");
@@ -132,12 +135,19 @@ DEFINE_string(gusli_client_name, "NIXLBench", "Client name for GUSLI backend");
132135
DEFINE_int32(gusli_max_simultaneous_requests,
133136
32,
134137
"Maximum number of simultaneous requests for GUSLI backend");
135-
DEFINE_string(gusli_config_file,
136-
"",
137-
"Configuration file content for GUSLI backend (if empty, uses default config)");
138+
DEFINE_string(
139+
gusli_config_file,
140+
"",
141+
"Configuration file content for GUSLI backend (if empty, auto-generated from device_list)");
138142
DEFINE_uint64(gusli_bdev_byte_offset,
139143
1048576,
140144
"Byte offset in block device for GUSLI operations (default: 1MB)");
145+
DEFINE_string(gusli_device_security,
146+
"",
147+
"Comma-separated list of security flags per device (e.g. 'sec=0x3,sec=0x71'). "
148+
"If empty or fewer than devices, uses 'sec=0x3' as default. "
149+
"For GUSLI backend, use device_list in format 'id:type:path' where type is F (file) "
150+
"or K (kernel device).");
141151

142152
std::string xferBenchConfig::runtime_type = "";
143153
std::string xferBenchConfig::worker_type = "";
@@ -169,6 +179,7 @@ int xferBenchConfig::gds_batch_pool_size = 0;
169179
int xferBenchConfig::gds_batch_limit = 0;
170180
int xferBenchConfig::gds_mt_num_threads = 0;
171181
std::string xferBenchConfig::gpunetio_device_list = "";
182+
std::string xferBenchConfig::gpunetio_oob_list = "";
172183
std::vector<std::string> devices = { };
173184
int xferBenchConfig::num_files = 0;
174185
std::string xferBenchConfig::posix_api_type = "";
@@ -190,6 +201,7 @@ std::string xferBenchConfig::gusli_client_name = "";
190201
int xferBenchConfig::gusli_max_simultaneous_requests = 0;
191202
std::string xferBenchConfig::gusli_config_file = "";
192203
uint64_t xferBenchConfig::gusli_bdev_byte_offset = 0;
204+
std::string xferBenchConfig::gusli_device_security = "";
193205

194206
int
195207
xferBenchConfig::loadFromFlags() {
@@ -237,6 +249,7 @@ xferBenchConfig::loadFromFlags() {
237249
// Load DOCA-specific configurations if backend is DOCA
238250
if (backend == XFERBENCH_BACKEND_GPUNETIO) {
239251
gpunetio_device_list = FLAGS_gpunetio_device_list;
252+
gpunetio_oob_list = FLAGS_gpunetio_oob_list;
240253
}
241254

242255
// Load HF3FS-specific configurations if backend is HF3FS
@@ -250,6 +263,7 @@ xferBenchConfig::loadFromFlags() {
250263
gusli_max_simultaneous_requests = FLAGS_gusli_max_simultaneous_requests;
251264
gusli_config_file = FLAGS_gusli_config_file;
252265
gusli_bdev_byte_offset = FLAGS_gusli_bdev_byte_offset;
266+
gusli_device_security = FLAGS_gusli_device_security;
253267
}
254268

255269
// Load OBJ-specific configurations if backend is OBJ
@@ -466,6 +480,8 @@ xferBenchConfig::printConfig() {
466480
if (backend == XFERBENCH_BACKEND_GPUNETIO) {
467481
printOption ("GPU CUDA Device id list (--device_list=dev1,dev2,...)",
468482
gpunetio_device_list);
483+
printOption("OOB network interface name for control path (--oob_list=ifface)",
484+
gpunetio_oob_list);
469485
}
470486
}
471487
printOption ("Initiator seg type (--initiator_seg_type=[DRAM,VRAM])", initiator_seg_type);

benchmark/nixlbench/src/utils/utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ class xferBenchConfig {
157157
static int gds_batch_limit;
158158
static int gds_mt_num_threads;
159159
static std::string gpunetio_device_list;
160+
static std::string gpunetio_oob_list;
160161
static long page_size;
161162
static std::string obj_access_key;
162163
static std::string obj_secret_key;
@@ -173,6 +174,7 @@ class xferBenchConfig {
173174
static int gusli_max_simultaneous_requests;
174175
static std::string gusli_config_file;
175176
static uint64_t gusli_bdev_byte_offset;
177+
static std::string gusli_device_security;
176178

177179
static int
178180
loadFromFlags();

0 commit comments

Comments
 (0)