From a7a6f00857a9322caf02879ed2cb46bd4b8223ff Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 07:39:22 +0000 Subject: [PATCH 01/32] UCP/PERF: Added config for block count --- src/tools/perf/api/libperf.h | 1 + src/tools/perf/perftest.c | 1 + src/tools/perf/perftest_params.c | 76 ++++++++++++++++++++++---------- 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index bec2786007b..62ea68b9f11 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -284,6 +284,7 @@ typedef struct ucx_perf_params { double percentile_rank; /* The percentile rank of the percentile reported in latency tests */ unsigned device_thread_count; /* Number of device threads */ + unsigned device_block_count; /* Number of device blocks */ void *rte_group; /* Opaque RTE group handle */ ucx_perf_rte_t *rte; /* RTE functions used to exchange data */ diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index ca514beaaeb..67cb6543291 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -209,6 +209,7 @@ ucs_status_t init_test_params(perftest_params_t *params) params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG; params->super.ucp.am_hdr_size = 0; params->super.device_thread_count = 1; + params->super.device_block_count = 1; params->super.ucp.is_daemon_mode = 0; params->super.ucp.dmn_local_addr = empty_addr; params->super.ucp.dmn_remote_addr = empty_addr; diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c index 6b431b1d526..0ba96a64f1c 100644 --- a/src/tools/perf/perftest_params.c +++ b/src/tools/perf/perftest_params.c @@ -131,8 +131,9 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" signal - signal-based timer\n"); printf("\n"); printf(" UCP only:\n"); - printf(" -T number of threads in the test (%d)\n", + printf(" -T [:] number of threads in the test (%d)\n", ctx->params.super.thread_count); + printf(" blocks is optional, it corresponds to the number of device blocks\n"); printf(" -M thread support level for progress engine (single)\n"); printf(" single - only the master thread can access\n"); printf(" serialized - one thread can access at a time\n"); @@ -169,23 +170,30 @@ static void usage(const struct perftest_context *ctx, const char *program) printf("\n"); } -static ucs_status_t parse_device_id(const char *opt_arg, int *device_id) +static ucs_status_t parse_int(const char *opt_arg, int *value, const char *desc, + int min_value, int max_value) { char *endptr; - int parsed_device_id; + int parsed_value; if (opt_arg == NULL) { - ucs_error("device id string is NULL"); + ucs_error("%s string is NULL", desc); return UCS_ERR_INVALID_PARAM; } - parsed_device_id = strtol(opt_arg, &endptr, 10); - if ((endptr == opt_arg) || (*endptr != '\0') || (parsed_device_id < 0)) { - ucs_error("Failed to parse device id: %s", opt_arg); + parsed_value = strtol(opt_arg, &endptr, 10); + if ((endptr == opt_arg) || (*endptr != '\0')) { + ucs_error("failed to parse %s: %s", desc, opt_arg); return UCS_ERR_INVALID_PARAM; } - *device_id = parsed_device_id; + if ((parsed_value < min_value) || (parsed_value > max_value)) { + ucs_error("value for %s (%s) is out of range: [%d, %d]", desc, opt_arg, + min_value, max_value); + return UCS_ERR_INVALID_PARAM; + } + + *value = parsed_value; return UCS_OK; } @@ -235,7 +243,7 @@ parse_accel_device(char *opt_arg, ucx_perf_accel_dev_t *dev) if (NULL == token) { device_id = UCX_PERF_MEM_DEV_DEFAULT; } else { - status = parse_device_id(token, &device_id); + status = parse_int(token, &device_id, "device id", 0, INT_MAX); if (status != UCS_OK) { return status; } @@ -307,6 +315,36 @@ static ucs_status_t parse_accel_device_params(const char *opt_arg, return UCS_OK; } +static ucs_status_t parse_thread_params(const char *opt_arg, + unsigned *thread_count, + unsigned *block_count) +{ + const char *delim = ":"; + char *saveptr = NULL; + char *token, *arg; + ucs_status_t status; + + arg = ucs_alloca(strlen(opt_arg) + 1); + strcpy(arg, opt_arg); + token = strtok_r(arg, delim, &saveptr); + status = parse_int(token, thread_count, "thread count", 1, INT_MAX); + if (status != UCS_OK) { + return status; + } + + token = strtok_r(NULL, delim, &saveptr); + if (token != NULL) { + status = parse_int(token, block_count, "block count", 1, INT_MAX); + if (status != UCS_OK) { + return status; + } + } else { + *block_count = 1; + } + + return UCS_OK; +} + static ucs_status_t parse_message_sizes_params(const char *opt_arg, ucx_perf_params_t *params) { @@ -563,8 +601,8 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt, return UCS_ERR_INVALID_PARAM; } case 'T': - params->super.thread_count = atoi(opt_arg); - return UCS_OK; + return parse_thread_params(opt_arg, ¶ms->super.thread_count, + ¶ms->super.device_block_count); case 'A': if (!strcmp(opt_arg, "thread") || !strcmp(opt_arg, "thread_spinlock")) { params->super.async_mode = UCS_ASYNC_MODE_THREAD_SPINLOCK; @@ -597,19 +635,11 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt, return UCS_ERR_INVALID_PARAM; } case 'm': - if (UCS_OK != parse_mem_type_params(opt_arg, - ¶ms->super.send_mem_type, - ¶ms->super.recv_mem_type)) { - return UCS_ERR_INVALID_PARAM; - } - return UCS_OK; + return parse_mem_type_params(opt_arg, ¶ms->super.send_mem_type, + ¶ms->super.recv_mem_type); case 'a': - if (UCS_OK != parse_accel_device_params(opt_arg, - ¶ms->super.send_device, - ¶ms->super.recv_device)) { - return UCS_ERR_INVALID_PARAM; - } - return UCS_OK; + return parse_accel_device_params(opt_arg, ¶ms->super.send_device, + ¶ms->super.recv_device); case 'y': params->super.flags |= UCX_PERF_TEST_FLAG_AM_RECV_COPY; return UCS_OK; From c27a9a867ab1cae46e10f315a6fcbd563c3c18e2 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 07:46:18 +0000 Subject: [PATCH 02/32] UCP/PERF: Added tests CMDs for single/partial --- src/tools/perf/api/libperf.h | 2 ++ src/tools/perf/cuda/cuda_kernel.cuh | 4 +++- src/tools/perf/lib/libperf.c | 4 ++++ src/tools/perf/lib/libperf_memory.c | 2 ++ src/tools/perf/perftest.c | 12 ++++++++++++ 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index 62ea68b9f11..d76b859e016 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -30,7 +30,9 @@ typedef enum { typedef enum { UCX_PERF_CMD_AM, UCX_PERF_CMD_PUT, + UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_CMD_PUT_MULTI, + UCX_PERF_CMD_PUT_PARTIAL, UCX_PERF_CMD_GET, UCX_PERF_CMD_ADD, UCX_PERF_CMD_FADD, diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index d79ed37a97f..bc4c79b8370 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -150,7 +150,9 @@ template ucs_status_t ucx_perf_cuda_dispatch(ucx_perf_context_t *perf) { Runner runner(*perf); - if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) { + if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) || + (perf->params.command == UCX_PERF_CMD_PUT_SINGLE) || + (perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) { if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) { return runner.run_pingpong(); } else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) { diff --git a/src/tools/perf/lib/libperf.c b/src/tools/perf/lib/libperf.c index 7a81a9b2228..cd1b88c44a6 100644 --- a/src/tools/perf/lib/libperf.c +++ b/src/tools/perf/lib/libperf.c @@ -481,7 +481,9 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, attr.cap.put.max_bcopy, attr.cap.put.max_zcopy); max_iov = attr.cap.put.max_iov; break; + case UCX_PERF_CMD_PUT_SINGLE: case UCX_PERF_CMD_PUT_MULTI: + case UCX_PERF_CMD_PUT_PARTIAL: min_size = 0; max_size = 0; max_iov = 0; @@ -885,7 +887,9 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params, message_size = ucx_perf_get_message_size(params); switch (params->command) { case UCX_PERF_CMD_PUT: + case UCX_PERF_CMD_PUT_SINGLE: case UCX_PERF_CMD_PUT_MULTI: + case UCX_PERF_CMD_PUT_PARTIAL: case UCX_PERF_CMD_GET: ucp_params->features |= UCP_FEATURE_RMA; break; diff --git a/src/tools/perf/lib/libperf_memory.c b/src/tools/perf/lib/libperf_memory.c index c74c7f79f13..0b2a14c1e5f 100644 --- a/src/tools/perf/lib/libperf_memory.c +++ b/src/tools/perf/lib/libperf_memory.c @@ -267,7 +267,9 @@ ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf) case UCX_PERF_CMD_PUT: flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT; break; + case UCX_PERF_CMD_PUT_SINGLE: case UCX_PERF_CMD_PUT_MULTI: + case UCX_PERF_CMD_PUT_PARTIAL: flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT | UCT_MD_MEM_ACCESS_REMOTE_ATOMIC; break; case UCX_PERF_CMD_GET: diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index 67cb6543291..f79b6214676 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -80,12 +80,24 @@ test_type_t tests[] = { {"ucp_put_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, "put bandwidth", "overhead", 32}, + {"ucp_put_single_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_TEST_TYPE_STREAM_UNI, + "put single bandwidth", "overhead", 32}, + + {"ucp_put_single_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_TEST_TYPE_PINGPONG, + "put single latency", "latency", 1}, + {"ucp_put_multi_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_MULTI, UCX_PERF_TEST_TYPE_STREAM_UNI, "put multi bandwidth", "overhead", 32}, {"ucp_put_multi_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_MULTI, UCX_PERF_TEST_TYPE_PINGPONG, "put multi latency", "latency", 32}, + {"ucp_put_partial_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_PARTIAL, UCX_PERF_TEST_TYPE_STREAM_UNI, + "put partial bandwidth", "overhead", 32}, + + {"ucp_put_partial_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_PARTIAL, UCX_PERF_TEST_TYPE_PINGPONG, + "put partial latency", "latency", 32}, + {"ucp_get", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, "get latency / bandwidth / message rate", "latency", 1}, From 0927c2f18703e0ea5f17e937830853bc3b81e065 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 07:48:44 +0000 Subject: [PATCH 03/32] UCP/PERF: Compact form for message sizes --- src/tools/perf/perftest_params.c | 63 ++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c index 0ba96a64f1c..eb188bb0583 100644 --- a/src/tools/perf/perftest_params.c +++ b/src/tools/perf/perftest_params.c @@ -74,6 +74,7 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" -s list of scatter-gather sizes for single message (%zu)\n", ctx->params.super.msg_size_list[0]); printf(" for example: \"-s 16,48,8192,8192,14\"\n"); + printf(" compact form example: \"-s 1024:16 expands to [1024, ..., 1024] with 16 elements\n"); printf(" -m [,]\n"); printf(" memory type of message for sender and receiver (host)\n"); print_memory_type_usage(); @@ -131,7 +132,8 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" signal - signal-based timer\n"); printf("\n"); printf(" UCP only:\n"); - printf(" -T [:] number of threads in the test (%d)\n", + printf(" -T [:]\n"); + printf(" number of threads in the test (%d)\n", ctx->params.super.thread_count); printf(" blocks is optional, it corresponds to the number of device blocks\n"); printf(" -M thread support level for progress engine (single)\n"); @@ -345,8 +347,8 @@ static ucs_status_t parse_thread_params(const char *opt_arg, return UCS_OK; } -static ucs_status_t parse_message_sizes_params(const char *opt_arg, - ucx_perf_params_t *params) +static ucs_status_t +parse_message_sizes_list(const char *opt_arg, ucx_perf_params_t *params) { const char delim = ','; size_t *msg_size_list, token_num, token_it; @@ -388,6 +390,61 @@ static ucs_status_t parse_message_sizes_params(const char *opt_arg, return UCS_OK; } +static ucs_status_t +parse_message_sizes_compact(const char *opt_arg, ucx_perf_params_t *params) +{ + const char *delim = ":"; + char *saveptr = NULL; + char *token, *arg; + int msg_size, element_count, i; + size_t *msg_size_list; + ucs_status_t status; + + + arg = ucs_alloca(strlen(opt_arg) + 1); + strcpy(arg, opt_arg); + token = strtok_r(arg, delim, &saveptr); + status = parse_int(token, &msg_size, "message size", 1, INT_MAX); + if (status != UCS_OK) { + return status; + } + + token = strtok_r(NULL, delim, &saveptr); + status = parse_int(token, &element_count, "elements", 1, INT_MAX); + if (status != UCS_OK) { + return status; + } + + msg_size_list = realloc(params->msg_size_list, + sizeof(*params->msg_size_list) * element_count); + if (NULL == msg_size_list) { + return UCS_ERR_NO_MEMORY; + } + + params->msg_size_list = msg_size_list; + for (i = 0; i < element_count; ++i) { + params->msg_size_list[i] = msg_size; + } + + params->msg_size_cnt = element_count; + return UCS_OK; +} + +static int is_compact_form(const char *input) +{ + return strchr(input, ':') != NULL; +} + +static ucs_status_t parse_message_sizes_params(const char *opt_arg, + ucx_perf_params_t *params) +{ + if (is_compact_form(opt_arg)) { + return parse_message_sizes_compact(opt_arg, params); + } + + return parse_message_sizes_list(opt_arg, params); +} + static ucs_status_t parse_ucp_datatype_params(const char *opt_arg, ucp_perf_datatype_t *datatype) { From 5a4be0f6232bd81d3c082a5d522c8753621342b2 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 08:02:01 +0000 Subject: [PATCH 04/32] UCP/PERF: Device level option --- src/tools/perf/api/libperf.h | 2 ++ src/tools/perf/perftest.c | 1 + src/tools/perf/perftest.h | 2 +- src/tools/perf/perftest_params.c | 22 ++++++++++++++++++++++ src/ucs/sys/device_code.h | 18 ++++++++++++++++++ 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index d76b859e016..2e1338b2423 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -11,6 +11,7 @@ #define UCX_LIBPERF_H #include +#include BEGIN_C_DECLS @@ -267,6 +268,7 @@ typedef struct ucx_perf_params { ucs_memory_type_t recv_mem_type; /* Recv memory type */ ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */ ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */ + ucs_device_level_t device_level; /* Device level for gdaki */ unsigned flags; /* See ucx_perf_test_flags. */ size_t *msg_size_list; /* Test message sizes list. The size diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index f79b6214676..00d782e5632 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -215,6 +215,7 @@ ucs_status_t init_test_params(perftest_params_t *params) params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST; params->super.send_device = default_dev; params->super.recv_device = default_dev; + params->super.device_level = UCS_DEVICE_LEVEL_THREAD; params->super.msg_size_cnt = 1; params->super.iov_stride = 0; params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG; diff --git a/src/tools/perf/perftest.h b/src/tools/perf/perftest.h index aa487b1aded..d61b44108fa 100644 --- a/src/tools/perf/perftest.h +++ b/src/tools/perf/perftest.h @@ -19,7 +19,7 @@ #endif #define TL_RESOURCE_NAME_NONE "" -#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:a:R:lyz" +#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:a:R:lyzL:" #define TEST_ID_UNDEFINED -1 #define DEFAULT_DAEMON_PORT 1338 diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c index eb188bb0583..6c3210d645e 100644 --- a/src/tools/perf/perftest_params.c +++ b/src/tools/perf/perftest_params.c @@ -71,6 +71,11 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" Accelerator device type and device id to use for running the test.\n"); printf(" device id is optional, it corresponds to the index of\n"); printf(" the device in the list of available devices\n"); + printf(" -L device cooperation level for gdaki (thread)\n"); + printf(" thread - thread level\n"); + printf(" warp - warp level\n"); + printf(" block - block level\n"); + printf(" grid - grid level\n"); printf(" -s list of scatter-gather sizes for single message (%zu)\n", ctx->params.super.msg_size_list[0]); printf(" for example: \"-s 16,48,8192,8192,14\"\n"); @@ -445,6 +450,21 @@ static ucs_status_t parse_message_sizes_params(const char *opt_arg, return parse_message_sizes_list(opt_arg, params); } +static ucs_status_t parse_device_level(const char *opt_arg, + ucs_device_level_t *device_level) +{ + ucs_device_level_t level; + for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) { + if (!strcmp(opt_arg, ucs_device_level_name(level))) { + *device_level = level; + return UCS_OK; + } + } + + ucs_error("Invalid option argument for device level: %s", opt_arg); + return UCS_ERR_INVALID_PARAM; +} + static ucs_status_t parse_ucp_datatype_params(const char *opt_arg, ucp_perf_datatype_t *datatype) { @@ -697,6 +717,8 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt, case 'a': return parse_accel_device_params(opt_arg, ¶ms->super.send_device, ¶ms->super.recv_device); + case 'L': + return parse_device_level(opt_arg, ¶ms->super.device_level); case 'y': params->super.flags |= UCX_PERF_TEST_FLAG_AM_RECV_COPY; return UCS_OK; diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index d7aa07f3f5b..d1b5e5dff89 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -30,6 +30,24 @@ typedef enum { } ucs_device_level_t; +static UCS_F_ALWAYS_INLINE const char* +ucs_device_level_name(ucs_device_level_t level) +{ + switch (level) { + case UCS_DEVICE_LEVEL_THREAD: + return "thread"; + case UCS_DEVICE_LEVEL_WARP: + return "warp"; + case UCS_DEVICE_LEVEL_BLOCK: + return "block"; + case UCS_DEVICE_LEVEL_GRID: + return "grid"; + default: + return "unknown"; + } +} + + /* * Read a 64-bit atomic value from a global memory address. */ From 4c4a3a6ef173f02f01322185fd14f537558cfd80 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 08:22:41 +0000 Subject: [PATCH 05/32] UCP/PERF: Support for IOV --- src/tools/perf/cuda/cuda_kernel.cuh | 24 +++++++++ src/tools/perf/cuda/ucp_cuda_kernel.cu | 67 +++++++++++++++++++++----- 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index bc4c79b8370..8c7bfae2d60 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -13,6 +13,8 @@ #include #include +#include + typedef unsigned long long ucx_perf_cuda_time_t; @@ -88,6 +90,28 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) return bits; } +struct ucx_perf_cuda_element { + unsigned index; + void *address; + uint64_t remote_address; + size_t length; +}; + +struct ucx_perf_cuda_element_list { + size_t count; + ucx_perf_cuda_element elements[0]; +}; + +template +using ucx_perf_cuda_unique_ptr = std::unique_ptr; + +template +ucx_perf_cuda_unique_ptr ucx_perf_cuda_make_unique(size_t size) { + T* raw = nullptr; + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &raw, size); + return ucx_perf_cuda_unique_ptr(raw, [](T* p){ if (p) cudaFree(p); }); +} + class ucx_perf_cuda_test_runner { public: ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index fb31396e60d..e09dd3fff1e 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -13,8 +13,6 @@ #include #include -#include - class ucp_perf_cuda_request_manager { public: @@ -217,6 +215,11 @@ public: return UCS_ERR_NO_MEMORY; } + unique_element_list_ptr element_list = create_element_list(); + if (!element_list) { + return UCS_ERR_NO_MEMORY; + } + ucp_perf_barrier(&m_perf); ucx_perf_test_start_clock(&m_perf); @@ -240,15 +243,25 @@ public: size_t length = ucx_perf_get_message_size(&m_perf.params); unsigned my_index = rte_call(&m_perf, group_index); - ucp_perf_barrier(&m_perf); - ucx_perf_test_start_clock(&m_perf); + unique_mem_list_ptr handle(nullptr, nullptr); + unique_element_list_ptr element_list(nullptr, nullptr); if (my_index == 1) { - unique_mem_list_ptr handle = create_mem_list(); + handle = create_mem_list(); if (!handle) { return UCS_ERR_NO_MEMORY; } + element_list = create_element_list(); + if (!element_list) { + return UCS_ERR_NO_MEMORY; + } + } + + ucp_perf_barrier(&m_perf); + ucx_perf_test_start_clock(&m_perf); + + if (my_index == 1) { unsigned thread_count = m_perf.params.device_thread_count; ucp_perf_cuda_put_multi_bw_kernel <<<1, thread_count>>>( @@ -274,19 +287,22 @@ private: unique_mem_list_ptr create_mem_list() const { - ucp_device_mem_list_elem_t elem; - elem.field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | - UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; - elem.memh = m_perf.ucp.send_memh; - elem.rkey = m_perf.ucp.rkey; + size_t count = m_perf.params.msg_size_cnt; + ucp_device_mem_list_elem_t elems[count]; + for (size_t i = 0; i < count; ++i) { + elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; + elems[i].memh = m_perf.ucp.send_memh; + elems[i].rkey = m_perf.ucp.rkey; + } ucp_device_mem_list_params_t params; params.field_mask = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS | UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE | UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS; params.element_size = sizeof(ucp_device_mem_list_elem_t); - params.num_elements = 1; - params.elements = &elem; + params.num_elements = count; + params.elements = elems; ucp_device_mem_list_handle_h mem_list; ucs_status_t status = ucp_device_mem_list_create(m_perf.ucp.ep, ¶ms, @@ -297,6 +313,33 @@ private: return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release); } + + using unique_element_list_ptr = + ucx_perf_cuda_unique_ptr; + + unique_element_list_ptr create_element_list() const + { + size_t count = m_perf.params.msg_size_cnt; + unique_element_list_ptr list = + ucx_perf_cuda_make_unique( + sizeof(ucx_perf_cuda_element_list) + + (count * sizeof(ucx_perf_cuda_element))); + + size_t offset = 0; + for (unsigned i = 0; i < count; ++i) { + ucx_perf_cuda_element elem = { + .index = i, + .address = (char *)m_perf.send_buffer + offset, + .remote_address = m_perf.ucp.remote_addr + offset, + .length = m_perf.params.msg_size_list[i] + }; + offset += elem.length; + CUDA_CALL_ERR(cudaMemcpy, &list->elements[i], &elem, sizeof(elem), + cudaMemcpyHostToDevice); + } + + return list; + } }; ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher; From 5a59ee2ac6a07ef213df2691c5312a8976803729 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 09:40:32 +0000 Subject: [PATCH 06/32] UCP/PERF: Kernel dispatch macro --- src/tools/perf/cuda/cuda_kernel.cuh | 43 ++++++++++++++++++++++++++ src/tools/perf/cuda/ucp_cuda_kernel.cu | 25 +++++++-------- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 8c7bfae2d60..cff1f263cdd 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -112,6 +112,49 @@ ucx_perf_cuda_unique_ptr ucx_perf_cuda_make_unique(size_t size) { return ucx_perf_cuda_unique_ptr(raw, [](T* p){ if (p) cudaFree(p); }); } +#define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \ + do { \ + switch (cmd) { \ + case UCX_PERF_CMD_PUT_SINGLE: \ + func<<>>(__VA_ARGS__); \ + break; \ + case UCX_PERF_CMD_PUT_MULTI: \ + func<<>>(__VA_ARGS__); \ + break; \ + case UCX_PERF_CMD_PUT_PARTIAL: \ + func<<>>(__VA_ARGS__); \ + break; \ + default: \ + ucs_error("Unsupported cmd: %d", cmd); \ + break; \ + } \ + } while (0) + +#define UCX_KERNEL_DISPATCH(perf, func, ...) \ + do { \ + ucs_device_level_t _level = perf.params.device_level; \ + ucx_perf_cmd_t _cmd = perf.params.command; \ + unsigned _blocks = perf.params.device_block_count; \ + unsigned _threads = perf.params.device_thread_count; \ + switch (_level) { \ + case UCS_DEVICE_LEVEL_THREAD: \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + break; \ + case UCS_DEVICE_LEVEL_WARP: \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + break; \ + case UCS_DEVICE_LEVEL_BLOCK: \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + break; \ + case UCS_DEVICE_LEVEL_GRID: \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + break; \ + default: \ + ucs_error("Unsupported level: %d", _level); \ + break; \ + } \ + } while (0) + class ucx_perf_cuda_test_runner { public: ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index e09dd3fff1e..089ebb259b7 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -71,7 +71,7 @@ private: uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)]; }; -template +template __global__ void ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ucp_device_mem_list_handle_h mem_list, @@ -149,7 +149,7 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list, return status; } -template +template __global__ void ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucp_device_mem_list_handle_h mem_list, @@ -206,9 +206,8 @@ public: ucs_status_t run_pingpong() { - size_t length = ucx_perf_get_message_size(&m_perf.params); - unsigned thread_count = m_perf.params.device_thread_count; - unsigned my_index = rte_call(&m_perf, group_index); + size_t length = ucx_perf_get_message_size(&m_perf.params); + unsigned my_index = rte_call(&m_perf, group_index); unique_mem_list_ptr handle = create_mem_list(); if (!handle) { @@ -223,10 +222,10 @@ public: ucp_perf_barrier(&m_perf); ucx_perf_test_start_clock(&m_perf); - ucp_perf_cuda_put_multi_latency_kernel - <<<1, thread_count>>>( - gpu_ctx(), handle.get(), 0, m_perf.send_buffer, - m_perf.ucp.remote_addr, length, m_perf.recv_buffer, my_index); + UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, + gpu_ctx(), handle.get(), 0, m_perf.send_buffer, + m_perf.ucp.remote_addr, length, m_perf.recv_buffer, + my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); @@ -262,11 +261,9 @@ public: ucx_perf_test_start_clock(&m_perf); if (my_index == 1) { - unsigned thread_count = m_perf.params.device_thread_count; - ucp_perf_cuda_put_multi_bw_kernel - <<<1, thread_count>>>( - gpu_ctx(), handle.get(), 0, m_perf.send_buffer, - m_perf.ucp.remote_addr, length); + UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel, + gpu_ctx(), handle.get(), 0, m_perf.send_buffer, + m_perf.ucp.remote_addr, length); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); } else if (my_index == 0) { From 0fcc18c22fc51845d311f2a59d56ec09f4c66a26 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 09:45:33 +0000 Subject: [PATCH 07/32] UCP/PERF: Fixed coverity warning --- test/gtest/common/test_perf.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/gtest/common/test_perf.cc b/test/gtest/common/test_perf.cc index 01d36c4ae74..93d3171311d 100644 --- a/test/gtest/common/test_perf.cc +++ b/test/gtest/common/test_perf.cc @@ -217,6 +217,8 @@ void test_perf::test_params_init(const test_spec &test, params.send_device = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT}; params.recv_device = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT}; params.device_thread_count = 1; + params.device_block_count = 1; + params.device_level = UCS_DEVICE_LEVEL_THREAD; params.percentile_rank = 50.0; memset(params.uct.md_name, 0, sizeof(params.uct.md_name)); From 50fbf7935d1eed97cd240ed1579c4653126eb4d8 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 12:00:05 +0000 Subject: [PATCH 08/32] UCP/PERF: Fixed build error --- src/ucs/sys/device_code.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index d1b5e5dff89..768a082ca65 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -7,6 +7,7 @@ #ifndef UCS_DEVICE_CODE_H #define UCS_DEVICE_CODE_H +#include #include /* From d99de4be48c29fda7f2fb86145c80874034dc31f Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 12:17:26 +0000 Subject: [PATCH 09/32] UCP/PERF: Common send function for all APIs --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 117 ++++++++++++++----------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 089ebb259b7..97e981c90db 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -71,13 +71,55 @@ private: uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)]; }; +template +UCS_F_DEVICE ucs_status_t +ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list, + ucx_perf_cuda_element_list &element_list, + ucp_device_request_t &req) +{ + switch (cmd) { + case UCX_PERF_CMD_PUT_SINGLE: { + ucx_perf_cuda_element &elem = element_list.elements[0]; + return ucp_device_put_single(mem_list, elem.index, + elem.address, elem.remote_address, + elem.length, 0, &req); + } + // case UCX_PERF_CMD_PUT_MULTI: + // return ucp_device_put_multi(mem_list, element_list->elements, + // element_list->count, 0, &req); + // case UCX_PERF_CMD_PUT_PARTIAL: + // return ucp_device_put_partial(mem_list, element_list->elements, + // element_list->count, 0, &req); + } + + return UCS_ERR_INVALID_PARAM; +} + +template +UCS_F_DEVICE ucs_status_t +ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list, + ucx_perf_cuda_element_list &element_list) +{ + ucp_device_request_t req; + ucs_status_t status = ucp_perf_cuda_send_nbx(mem_list, + element_list, req); + if (status != UCS_OK) { + return status; + } + + do { + status = ucp_device_progress_req(&req); + } while (status == UCS_INPROGRESS); + + return status; +} + template __global__ void ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ucp_device_mem_list_handle_h mem_list, - unsigned mem_list_index, - const void *address, uint64_t remote_address, - size_t length) + ucx_perf_cuda_element_list &element_list, + const void *address, size_t length) { ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; @@ -95,8 +137,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, *sn = idx + 1; ucp_device_request_t &req = request_mgr.get_request(); - status = ucp_device_put_single(mem_list, mem_list_index, address, - remote_address, length, 0, &req); + status = ucp_perf_cuda_send_nbx(mem_list, element_list, req); if (status != UCS_OK) { break; } @@ -115,47 +156,12 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ctx.status = status; } -__global__ void -ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx, - const void *address, size_t length) -{ - volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length); - while (*sn < ctx.max_iters) { - __nanosleep(100000); // 100us - } - - ctx.status = UCS_OK; -} - -template -UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list, - unsigned mem_list_index, const void *address, - uint64_t remote_address, size_t length) -{ - ucp_device_request_t req; - ucs_status_t status; - - status = ucp_device_put_single(mem_list, mem_list_index, address, - remote_address, length, 0, &req); - if (status != UCS_OK) { - return status; - } - - do { - status = ucp_device_progress_req(&req); - } while (status == UCS_INPROGRESS); - - return status; -} - template __global__ void ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucp_device_mem_list_handle_h mem_list, - unsigned mem_list_index, + ucx_perf_cuda_element_list &element_list, const void *address, - uint64_t remote_address, size_t length, const void *recv_address, bool is_sender) { @@ -168,9 +174,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { if (is_sender) { *sn = idx + 1; - status = ucp_perf_cuda_put_single(mem_list, mem_list_index, - address, remote_address, - length); + status = ucp_perf_cuda_send_sync(mem_list, element_list); if (status != UCS_OK) { break; } @@ -178,9 +182,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, } else { ucx_perf_cuda_wait_sn(recv_sn, idx + 1); *sn = idx + 1; - status = ucp_perf_cuda_put_single(mem_list, mem_list_index, - address, remote_address, - length); + status = ucp_perf_cuda_send_sync(mem_list, element_list); if (status != UCS_OK) { break; } @@ -193,6 +195,18 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ctx.status = status; } +__global__ void +ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx, + const void *address, size_t length) +{ + volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length); + while (*sn < ctx.max_iters) { + __nanosleep(100000); // 100us + } + + ctx.status = UCS_OK; +} + class ucp_perf_cuda_test_runner : public ucx_perf_cuda_test_runner { public: ucp_perf_cuda_test_runner(ucx_perf_context_t &perf) : @@ -223,9 +237,8 @@ public: ucx_perf_test_start_clock(&m_perf); UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, - gpu_ctx(), handle.get(), 0, m_perf.send_buffer, - m_perf.ucp.remote_addr, length, m_perf.recv_buffer, - my_index); + gpu_ctx(), handle.get(), *element_list, m_perf.send_buffer, + length, m_perf.recv_buffer, my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); @@ -262,8 +275,8 @@ public: if (my_index == 1) { UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel, - gpu_ctx(), handle.get(), 0, m_perf.send_buffer, - m_perf.ucp.remote_addr, length); + gpu_ctx(), handle.get(), *element_list, + m_perf.send_buffer, length); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); } else if (my_index == 0) { From 7e7fd34d439d2b671cc16442004c2b5d26b22c0e Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Wed, 17 Sep 2025 13:18:53 +0000 Subject: [PATCH 10/32] UCP/PERF: Common element list for all APIs --- src/tools/perf/cuda/cuda_kernel.cuh | 38 ++++++++-------- src/tools/perf/cuda/ucp_cuda_kernel.cu | 60 ++++++++++++++------------ 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index cff1f263cdd..4291c1d6ef6 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -90,27 +90,29 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) return bits; } -struct ucx_perf_cuda_element { - unsigned index; - void *address; - uint64_t remote_address; - size_t length; -}; - struct ucx_perf_cuda_element_list { - size_t count; - ucx_perf_cuda_element elements[0]; -}; + size_t m_count; + unsigned *m_indices; + void **m_addresses; + uint64_t *m_remote_addresses; + size_t *m_lengths; -template -using ucx_perf_cuda_unique_ptr = std::unique_ptr; + ucx_perf_cuda_element_list(size_t count) : m_count(count) + { + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_indices, count * sizeof(unsigned)); + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_addresses, count * sizeof(void*)); + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_remote_addresses, count * sizeof(uint64_t)); + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_lengths, count * sizeof(size_t)); + } -template -ucx_perf_cuda_unique_ptr ucx_perf_cuda_make_unique(size_t size) { - T* raw = nullptr; - CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &raw, size); - return ucx_perf_cuda_unique_ptr(raw, [](T* p){ if (p) cudaFree(p); }); -} + ~ucx_perf_cuda_element_list() + { + CUDA_CALL_WARN(cudaFree, m_indices); + CUDA_CALL_WARN(cudaFree, m_addresses); + CUDA_CALL_WARN(cudaFree, m_remote_addresses); + CUDA_CALL_WARN(cudaFree, m_lengths); + } +}; #define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \ do { \ diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 97e981c90db..c803ae6aba4 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -13,6 +13,8 @@ #include #include +#include + class ucp_perf_cuda_request_manager { public: @@ -78,12 +80,11 @@ ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list, ucp_device_request_t &req) { switch (cmd) { - case UCX_PERF_CMD_PUT_SINGLE: { - ucx_perf_cuda_element &elem = element_list.elements[0]; - return ucp_device_put_single(mem_list, elem.index, - elem.address, elem.remote_address, - elem.length, 0, &req); - } + case UCX_PERF_CMD_PUT_SINGLE: + return ucp_device_put_single(mem_list, element_list.m_indices[0], + element_list.m_addresses[0], + element_list.m_remote_addresses[0], + element_list.m_lengths[0], 0, &req); // case UCX_PERF_CMD_PUT_MULTI: // return ucp_device_put_multi(mem_list, element_list->elements, // element_list->count, 0, &req); @@ -256,7 +257,7 @@ public: unsigned my_index = rte_call(&m_perf, group_index); unique_mem_list_ptr handle(nullptr, nullptr); - unique_element_list_ptr element_list(nullptr, nullptr); + unique_element_list_ptr element_list; if (my_index == 1) { handle = create_mem_list(); @@ -324,31 +325,36 @@ private: return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release); } - using unique_element_list_ptr = - ucx_perf_cuda_unique_ptr; + using unique_element_list_ptr = std::unique_ptr; unique_element_list_ptr create_element_list() const { - size_t count = m_perf.params.msg_size_cnt; - unique_element_list_ptr list = - ucx_perf_cuda_make_unique( - sizeof(ucx_perf_cuda_element_list) + - (count * sizeof(ucx_perf_cuda_element))); - - size_t offset = 0; - for (unsigned i = 0; i < count; ++i) { - ucx_perf_cuda_element elem = { - .index = i, - .address = (char *)m_perf.send_buffer + offset, - .remote_address = m_perf.ucp.remote_addr + offset, - .length = m_perf.params.msg_size_list[i] - }; - offset += elem.length; - CUDA_CALL_ERR(cudaMemcpy, &list->elements[i], &elem, sizeof(elem), - cudaMemcpyHostToDevice); + size_t count = m_perf.params.msg_size_cnt; + unique_element_list_ptr element_list = + unique_element_list_ptr(new ucx_perf_cuda_element_list(count)); + + std::vector indices(count); + std::vector addresses(count); + std::vector remote_addresses(count); + std::vector lengths(count); + for (unsigned i = 0, offset = 0; i < count; ++i) { + indices[i] = i; + addresses[i] = (char *)m_perf.send_buffer + offset; + remote_addresses[i] = m_perf.ucp.remote_addr + offset; + lengths[i] = m_perf.params.msg_size_list[i]; + offset += lengths[i]; } - return list; + CUDA_CALL_ERR(cudaMemcpy, element_list->m_indices, indices.data(), + count * sizeof(unsigned), cudaMemcpyHostToDevice); + CUDA_CALL_ERR(cudaMemcpy, element_list->m_addresses, addresses.data(), + count * sizeof(void*), cudaMemcpyHostToDevice); + CUDA_CALL_ERR(cudaMemcpy, element_list->m_remote_addresses, + remote_addresses.data(), count * sizeof(uint64_t), + cudaMemcpyHostToDevice); + CUDA_CALL_ERR(cudaMemcpy, element_list->m_lengths, lengths.data(), + count * sizeof(size_t), cudaMemcpyHostToDevice); + return element_list; } }; From 0733535fa373a52482577b18a8c107a11b1df61b Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 07:47:58 +0000 Subject: [PATCH 11/32] UCP/PERF: Common params for all APIs --- src/tools/perf/cuda/cuda_kernel.cuh | 24 --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 216 ++++++++++++------------- 2 files changed, 106 insertions(+), 134 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 4291c1d6ef6..f24a4ac568b 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -90,30 +90,6 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) return bits; } -struct ucx_perf_cuda_element_list { - size_t m_count; - unsigned *m_indices; - void **m_addresses; - uint64_t *m_remote_addresses; - size_t *m_lengths; - - ucx_perf_cuda_element_list(size_t count) : m_count(count) - { - CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_indices, count * sizeof(unsigned)); - CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_addresses, count * sizeof(void*)); - CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_remote_addresses, count * sizeof(uint64_t)); - CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_lengths, count * sizeof(size_t)); - } - - ~ucx_perf_cuda_element_list() - { - CUDA_CALL_WARN(cudaFree, m_indices); - CUDA_CALL_WARN(cudaFree, m_addresses); - CUDA_CALL_WARN(cudaFree, m_remote_addresses); - CUDA_CALL_WARN(cudaFree, m_lengths); - } -}; - #define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \ do { \ switch (cmd) { \ diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index c803ae6aba4..0476262f892 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -73,18 +73,106 @@ private: uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)]; }; +struct ucp_perf_cuda_params { + ucp_device_mem_list_handle_h mem_list; + unsigned *indices; + void **addresses; + uint64_t *remote_addresses; + size_t *lengths; + ucp_device_flags_t flags; +}; + +class ucp_perf_cuda_params_handler { +public: + ucp_perf_cuda_params_handler(const ucx_perf_context_t &perf) + { + init_mem_list(perf); + init_elements(perf); + m_params.flags = UCP_DEVICE_FLAG_NODELAY; + } + + ~ucp_perf_cuda_params_handler() + { + ucp_device_mem_list_release(m_params.mem_list); + CUDA_CALL_WARN(cudaFree, m_params.indices); + CUDA_CALL_WARN(cudaFree, m_params.addresses); + CUDA_CALL_WARN(cudaFree, m_params.remote_addresses); + CUDA_CALL_WARN(cudaFree, m_params.lengths); + } + + const ucp_perf_cuda_params &get_params() const { return m_params; } + +private: + void init_mem_list(const ucx_perf_context_t &perf) + { + size_t count = perf.params.msg_size_cnt; + ucp_device_mem_list_elem_t elems[count]; + for (size_t i = 0; i < count; ++i) { + elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; + elems[i].memh = perf.ucp.send_memh; + elems[i].rkey = perf.ucp.rkey; + } + + ucp_device_mem_list_params_t params; + params.field_mask = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS | + UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE | + UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS; + params.element_size = sizeof(ucp_device_mem_list_elem_t); + params.num_elements = count; + params.elements = elems; + + ucs_status_t status = ucp_device_mem_list_create(perf.ucp.ep, ¶ms, + &m_params.mem_list); + if (status != UCS_OK) { + throw std::runtime_error("Failed to create memory list"); + } + } + + void init_elements(const ucx_perf_context_t &perf) + { + size_t count = perf.params.msg_size_cnt; + + std::vector indices(count); + std::vector addresses(count); + std::vector remote_addresses(count); + std::vector lengths(count); + for (unsigned i = 0, offset = 0; i < count; ++i) { + indices[i] = i; + addresses[i] = (char *)perf.send_buffer + offset; + remote_addresses[i] = perf.ucp.remote_addr + offset; + lengths[i] = perf.params.msg_size_list[i]; + offset += lengths[i]; + } + + device_clone(&m_params.indices, indices.data(), count); + device_clone(&m_params.addresses, addresses.data(), count); + device_clone(&m_params.remote_addresses, remote_addresses.data(), count); + device_clone(&m_params.lengths, lengths.data(), count); + } + + template + void device_clone(T **dst, const T *src, size_t count) + { + CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, dst, count * sizeof(T)); + CUDA_CALL_ERR(cudaMemcpy, *dst, src, count * sizeof(T), + cudaMemcpyHostToDevice); + } + + ucp_perf_cuda_params m_params; +}; + template UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list, - ucx_perf_cuda_element_list &element_list, - ucp_device_request_t &req) +ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucp_device_request_t &req) { switch (cmd) { case UCX_PERF_CMD_PUT_SINGLE: - return ucp_device_put_single(mem_list, element_list.m_indices[0], - element_list.m_addresses[0], - element_list.m_remote_addresses[0], - element_list.m_lengths[0], 0, &req); + return ucp_device_put_single(params.mem_list, params.indices[0], + params.addresses[0], + params.remote_addresses[0], + params.lengths[0], params.flags, + &req); // case UCX_PERF_CMD_PUT_MULTI: // return ucp_device_put_multi(mem_list, element_list->elements, // element_list->count, 0, &req); @@ -98,12 +186,10 @@ ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list, template UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list, - ucx_perf_cuda_element_list &element_list) +ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms) { ucp_device_request_t req; - ucs_status_t status = ucp_perf_cuda_send_nbx(mem_list, - element_list, req); + ucs_status_t status = ucp_perf_cuda_send_nbx(params, req); if (status != UCS_OK) { return status; } @@ -118,8 +204,7 @@ ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list, template __global__ void ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, - ucp_device_mem_list_handle_h mem_list, - ucx_perf_cuda_element_list &element_list, + ucp_perf_cuda_params params, const void *address, size_t length) { ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); @@ -138,7 +223,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, *sn = idx + 1; ucp_device_request_t &req = request_mgr.get_request(); - status = ucp_perf_cuda_send_nbx(mem_list, element_list, req); + status = ucp_perf_cuda_send_nbx(params, req); if (status != UCS_OK) { break; } @@ -160,8 +245,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, template __global__ void ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, - ucp_device_mem_list_handle_h mem_list, - ucx_perf_cuda_element_list &element_list, + ucp_perf_cuda_params params, const void *address, size_t length, const void *recv_address, bool is_sender) @@ -175,7 +259,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { if (is_sender) { *sn = idx + 1; - status = ucp_perf_cuda_send_sync(mem_list, element_list); + status = ucp_perf_cuda_send_sync(params); if (status != UCS_OK) { break; } @@ -183,7 +267,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, } else { ucx_perf_cuda_wait_sn(recv_sn, idx + 1); *sn = idx + 1; - status = ucp_perf_cuda_send_sync(mem_list, element_list); + status = ucp_perf_cuda_send_sync(params); if (status != UCS_OK) { break; } @@ -223,22 +307,13 @@ public: { size_t length = ucx_perf_get_message_size(&m_perf.params); unsigned my_index = rte_call(&m_perf, group_index); - - unique_mem_list_ptr handle = create_mem_list(); - if (!handle) { - return UCS_ERR_NO_MEMORY; - } - - unique_element_list_ptr element_list = create_element_list(); - if (!element_list) { - return UCS_ERR_NO_MEMORY; - } + ucp_perf_cuda_params_handler params_handler(m_perf); ucp_perf_barrier(&m_perf); ucx_perf_test_start_clock(&m_perf); UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, - gpu_ctx(), handle.get(), *element_list, m_perf.send_buffer, + gpu_ctx(), params_handler.get_params(), m_perf.send_buffer, length, m_perf.recv_buffer, my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); @@ -256,27 +331,14 @@ public: size_t length = ucx_perf_get_message_size(&m_perf.params); unsigned my_index = rte_call(&m_perf, group_index); - unique_mem_list_ptr handle(nullptr, nullptr); - unique_element_list_ptr element_list; - - if (my_index == 1) { - handle = create_mem_list(); - if (!handle) { - return UCS_ERR_NO_MEMORY; - } - - element_list = create_element_list(); - if (!element_list) { - return UCS_ERR_NO_MEMORY; - } - } + ucp_perf_cuda_params_handler params_handler(m_perf); ucp_perf_barrier(&m_perf); ucx_perf_test_start_clock(&m_perf); if (my_index == 1) { UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel, - gpu_ctx(), handle.get(), *element_list, + gpu_ctx(), params_handler.get_params(), m_perf.send_buffer, length); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); @@ -290,72 +352,6 @@ public: ucp_perf_barrier(&m_perf); return UCS_OK; } - -private: - using unique_mem_list_ptr = - std::unique_ptr; - - unique_mem_list_ptr create_mem_list() const - { - size_t count = m_perf.params.msg_size_cnt; - ucp_device_mem_list_elem_t elems[count]; - for (size_t i = 0; i < count; ++i) { - elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | - UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; - elems[i].memh = m_perf.ucp.send_memh; - elems[i].rkey = m_perf.ucp.rkey; - } - - ucp_device_mem_list_params_t params; - params.field_mask = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS | - UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE | - UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS; - params.element_size = sizeof(ucp_device_mem_list_elem_t); - params.num_elements = count; - params.elements = elems; - - ucp_device_mem_list_handle_h mem_list; - ucs_status_t status = ucp_device_mem_list_create(m_perf.ucp.ep, ¶ms, - &mem_list); - if (status != UCS_OK) { - return unique_mem_list_ptr(nullptr, nullptr); - } - - return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release); - } - - using unique_element_list_ptr = std::unique_ptr; - - unique_element_list_ptr create_element_list() const - { - size_t count = m_perf.params.msg_size_cnt; - unique_element_list_ptr element_list = - unique_element_list_ptr(new ucx_perf_cuda_element_list(count)); - - std::vector indices(count); - std::vector addresses(count); - std::vector remote_addresses(count); - std::vector lengths(count); - for (unsigned i = 0, offset = 0; i < count; ++i) { - indices[i] = i; - addresses[i] = (char *)m_perf.send_buffer + offset; - remote_addresses[i] = m_perf.ucp.remote_addr + offset; - lengths[i] = m_perf.params.msg_size_list[i]; - offset += lengths[i]; - } - - CUDA_CALL_ERR(cudaMemcpy, element_list->m_indices, indices.data(), - count * sizeof(unsigned), cudaMemcpyHostToDevice); - CUDA_CALL_ERR(cudaMemcpy, element_list->m_addresses, addresses.data(), - count * sizeof(void*), cudaMemcpyHostToDevice); - CUDA_CALL_ERR(cudaMemcpy, element_list->m_remote_addresses, - remote_addresses.data(), count * sizeof(uint64_t), - cudaMemcpyHostToDevice); - CUDA_CALL_ERR(cudaMemcpy, element_list->m_lengths, lengths.data(), - count * sizeof(size_t), cudaMemcpyHostToDevice); - return element_list; - } }; ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher; From 74f4b692484476f6e9085e06f10c2e50ad1cf3bb Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 08:02:09 +0000 Subject: [PATCH 12/32] UCP/PERF: Report kernel status --- src/tools/perf/cuda/cuda_kernel.cuh | 7 ++----- src/tools/perf/cuda/ucp_cuda_kernel.cu | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index f24a4ac568b..07b4412814d 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -153,8 +153,6 @@ public: CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx); } - ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; } - void wait_for_kernel(size_t msg_length) { ucx_perf_counter_t last_completed = 0; @@ -176,6 +174,8 @@ public: protected: ucx_perf_context_t &m_perf; + ucx_perf_cuda_context *m_cpu_ctx; + ucx_perf_cuda_context *m_gpu_ctx; private: void init_ctx() @@ -185,9 +185,6 @@ private: CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer, &m_gpu_ctx, m_cpu_ctx, 0); } - - ucx_perf_cuda_context *m_cpu_ctx; - ucx_perf_cuda_context *m_gpu_ctx; }; diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 0476262f892..72017b47757 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -204,12 +204,10 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms) template __global__ void ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, - ucp_perf_cuda_params params, - const void *address, size_t length) + ucp_perf_cuda_params params) { ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; - uint64_t *sn = ucx_perf_cuda_get_sn(address, length); ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding); ucs_status_t status; @@ -221,7 +219,13 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, } } - *sn = idx + 1; + /* TODO: Change to ucp_device_counter_write */ + if ((cmd == UCX_PERF_CMD_PUT_SINGLE) && (idx == max_iters - 1)) { + uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], + params.lengths[0]); + *sn = idx + 1; + } + ucp_device_request_t &req = request_mgr.get_request(); status = ucp_perf_cuda_send_nbx(params, req); if (status != UCS_OK) { @@ -313,7 +317,7 @@ public: ucx_perf_test_start_clock(&m_perf); UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, - gpu_ctx(), params_handler.get_params(), m_perf.send_buffer, + *m_gpu_ctx, params_handler.get_params(), m_perf.send_buffer, length, m_perf.recv_buffer, my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); @@ -323,7 +327,7 @@ public: ucx_perf_get_time(&m_perf); ucp_perf_barrier(&m_perf); - return UCS_OK; + return m_cpu_ctx->status; } ucs_status_t run_stream_uni() @@ -338,19 +342,18 @@ public: if (my_index == 1) { UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel, - gpu_ctx(), params_handler.get_params(), - m_perf.send_buffer, length); + *m_gpu_ctx, params_handler.get_params()); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(length); } else if (my_index == 0) { ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>( - gpu_ctx(), m_perf.recv_buffer, length); + *m_gpu_ctx, m_perf.recv_buffer, length); } CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize); ucx_perf_get_time(&m_perf); ucp_perf_barrier(&m_perf); - return UCS_OK; + return m_cpu_ctx->status; } }; From 4015d74d203d3e7e1ee7fbc90a07baed35df8488 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 08:22:16 +0000 Subject: [PATCH 13/32] UCP/PERF: Reduce number of kernel args --- src/tools/perf/cuda/cuda_kernel.cuh | 3 +- src/tools/perf/cuda/ucp_cuda_kernel.cu | 41 ++++++++++++-------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 07b4412814d..6c6e192d3ab 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -153,8 +153,9 @@ public: CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx); } - void wait_for_kernel(size_t msg_length) + void wait_for_kernel() { + size_t msg_length = ucx_perf_get_message_size(&m_perf.params); ucx_perf_counter_t last_completed = 0; ucx_perf_counter_t completed = m_cpu_ctx->completed_iters; while (1) { diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 72017b47757..e64ee5fdc5f 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -164,15 +164,20 @@ private: template UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucp_device_request_t &req) +ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, + ucp_device_request_t &req) { switch (cmd) { - case UCX_PERF_CMD_PUT_SINGLE: + case UCX_PERF_CMD_PUT_SINGLE: { + /* TODO: Change to ucp_device_counter_write */ + uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], params.lengths[0]); + *sn = idx + 1; return ucp_device_put_single(params.mem_list, params.indices[0], params.addresses[0], params.remote_addresses[0], params.lengths[0], params.flags, &req); + } // case UCX_PERF_CMD_PUT_MULTI: // return ucp_device_put_multi(mem_list, element_list->elements, // element_list->count, 0, &req); @@ -186,10 +191,10 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucp_device_request_t &req) template UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms) +ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx) { ucp_device_request_t req; - ucs_status_t status = ucp_perf_cuda_send_nbx(params, req); + ucs_status_t status = ucp_perf_cuda_send_nbx(params, idx, req); if (status != UCS_OK) { return status; } @@ -219,15 +224,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, } } - /* TODO: Change to ucp_device_counter_write */ - if ((cmd == UCX_PERF_CMD_PUT_SINGLE) && (idx == max_iters - 1)) { - uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], - params.lengths[0]); - *sn = idx + 1; - } - ucp_device_request_t &req = request_mgr.get_request(); - status = ucp_perf_cuda_send_nbx(params, req); + status = ucp_perf_cuda_send_nbx(params, idx, req); if (status != UCS_OK) { break; } @@ -250,20 +248,19 @@ template __global__ void ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucp_perf_cuda_params params, - const void *address, - size_t length, const void *recv_address, + const void *recv_address, size_t length, bool is_sender) { ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; - uint64_t *sn = ucx_perf_cuda_get_sn(address, length); + uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], length); uint64_t *recv_sn = ucx_perf_cuda_get_sn(recv_address, length); ucs_status_t status = UCS_OK; for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { if (is_sender) { *sn = idx + 1; - status = ucp_perf_cuda_send_sync(params); + status = ucp_perf_cuda_send_sync(params, idx); if (status != UCS_OK) { break; } @@ -271,7 +268,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, } else { ucx_perf_cuda_wait_sn(recv_sn, idx + 1); *sn = idx + 1; - status = ucp_perf_cuda_send_sync(params); + status = ucp_perf_cuda_send_sync(params, idx); if (status != UCS_OK) { break; } @@ -317,11 +314,11 @@ public: ucx_perf_test_start_clock(&m_perf); UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, - *m_gpu_ctx, params_handler.get_params(), m_perf.send_buffer, - length, m_perf.recv_buffer, my_index); + *m_gpu_ctx, params_handler.get_params(), + m_perf.recv_buffer, length, my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); - wait_for_kernel(length); + wait_for_kernel(); CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize); @@ -332,7 +329,6 @@ public: ucs_status_t run_stream_uni() { - size_t length = ucx_perf_get_message_size(&m_perf.params); unsigned my_index = rte_call(&m_perf, group_index); ucp_perf_cuda_params_handler params_handler(m_perf); @@ -344,8 +340,9 @@ public: UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel, *m_gpu_ctx, params_handler.get_params()); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); - wait_for_kernel(length); + wait_for_kernel(); } else if (my_index == 0) { + size_t length = ucx_perf_get_message_size(&m_perf.params); ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>( *m_gpu_ctx, m_perf.recv_buffer, length); } From 39188ba52fce384ddec31474d24f8f81c12e39db Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 11:35:53 +0000 Subject: [PATCH 14/32] UCP/PERF: Progress fix from Thomas --- src/tools/perf/cuda/cuda_kernel.cuh | 4 ++-- src/tools/perf/cuda/ucp_cuda_kernel.cu | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 6c6e192d3ab..dca1942d7ae 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -81,9 +81,9 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) { return count; } -UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) { +UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from) { for (size_t i = from; i < bits; i++) { - if (UCX_BIT_GET(set, i)) { + if (!UCX_BIT_GET(set, i)) { return i; } } diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index e64ee5fdc5f..8d690f9e0f0 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -54,7 +54,7 @@ public: __device__ ucp_device_request_t &get_request() { assert(get_pending_count() < m_size); - size_t index = ucx_bitset_ffs(m_pending, m_size, 0); + size_t index = ucx_bitset_ffns(m_pending, m_size, 0); UCX_BIT_SET(m_pending, index); return m_requests[index]; } @@ -220,15 +220,17 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, while (request_mgr.get_pending_count() >= ctx.max_outstanding) { status = request_mgr.progress(1); if (status != UCS_OK) { - break; + continue; } } ucp_device_request_t &req = request_mgr.get_request(); - status = ucp_perf_cuda_send_nbx(params, idx, req); - if (status != UCS_OK) { - break; - } + do { + status = ucp_perf_cuda_send_nbx(params, idx, req); + if (status == UCS_ERR_NO_RESOURCE) { + request_mgr.progress(1); + } + } while (status == UCS_ERR_NO_RESOURCE); ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time); __syncthreads(); From 433c48d2c434d63f27277be9fc8ec7ec5c7208e3 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 11:37:45 +0000 Subject: [PATCH 15/32] UCP/PERF: TODO comment --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 8d690f9e0f0..b32a601d7ae 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -225,6 +225,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, } ucp_device_request_t &req = request_mgr.get_request(); + // TODO: remove loop once API is changed do { status = ucp_perf_cuda_send_nbx(params, idx, req); if (status == UCS_ERR_NO_RESOURCE) { From dc25dd4b9ac78f2716d0d43396d019a20389336c Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 12:10:01 +0000 Subject: [PATCH 16/32] UCP/PERF: Counters in params --- src/tools/perf/cuda/cuda_kernel.cuh | 7 ++-- src/tools/perf/cuda/ucp_cuda_kernel.cu | 45 ++++++++++++++------------ 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index dca1942d7ae..5a31d3dac41 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -51,15 +51,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx, } } -UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length) +static UCS_F_ALWAYS_INLINE uint64_t * +ucx_perf_cuda_get_sn(const void *address, size_t length) { return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t)); } -UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value) +UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value) { if (threadIdx.x == 0) { - while (*sn < value); + while (ucs_device_atomic64_read(sn) < value); } __syncthreads(); } diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index b32a601d7ae..a0c009c9b6e 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -79,6 +79,9 @@ struct ucp_perf_cuda_params { void **addresses; uint64_t *remote_addresses; size_t *lengths; + uint64_t counter_remote; + uint64_t *counter_send; + uint64_t *counter_recv; ucp_device_flags_t flags; }; @@ -88,7 +91,7 @@ public: { init_mem_list(perf); init_elements(perf); - m_params.flags = UCP_DEVICE_FLAG_NODELAY; + init_counters(perf); } ~ucp_perf_cuda_params_handler() @@ -151,6 +154,16 @@ private: device_clone(&m_params.lengths, lengths.data(), count); } + void init_counters(const ucx_perf_context_t &perf) + { + size_t length = ucx_perf_get_message_size(&perf.params); + m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn( + (void*)perf.ucp.remote_addr, length); + m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer, length); + m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer, length); + m_params.flags = UCP_DEVICE_FLAG_NODELAY; + } + template void device_clone(T **dst, const T *src, size_t count) { @@ -168,16 +181,14 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, ucp_device_request_t &req) { switch (cmd) { - case UCX_PERF_CMD_PUT_SINGLE: { + case UCX_PERF_CMD_PUT_SINGLE: /* TODO: Change to ucp_device_counter_write */ - uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], params.lengths[0]); - *sn = idx + 1; + *params.counter_send = idx + 1; return ucp_device_put_single(params.mem_list, params.indices[0], params.addresses[0], params.remote_addresses[0], params.lengths[0], params.flags, &req); - } // case UCX_PERF_CMD_PUT_MULTI: // return ucp_device_put_multi(mem_list, element_list->elements, // element_list->count, 0, &req); @@ -239,7 +250,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, while (request_mgr.get_pending_count() > 0) { status = request_mgr.progress(max_iters); - if (status != UCS_OK) { + if (UCS_STATUS_IS_ERR(status)) { break; } } @@ -251,26 +262,21 @@ template __global__ void ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucp_perf_cuda_params params, - const void *recv_address, size_t length, bool is_sender) { ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; - uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], length); - uint64_t *recv_sn = ucx_perf_cuda_get_sn(recv_address, length); ucs_status_t status = UCS_OK; for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { if (is_sender) { - *sn = idx + 1; status = ucp_perf_cuda_send_sync(params, idx); if (status != UCS_OK) { break; } - ucx_perf_cuda_wait_sn(recv_sn, idx + 1); + ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); } else { - ucx_perf_cuda_wait_sn(recv_sn, idx + 1); - *sn = idx + 1; + ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); status = ucp_perf_cuda_send_sync(params, idx); if (status != UCS_OK) { break; @@ -286,9 +292,10 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, __global__ void ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx, - const void *address, size_t length) + ucp_perf_cuda_params params) { - volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length); + // TODO: we can use ucp_device_counter_read, but it adds latency + volatile uint64_t *sn = params.counter_recv; while (*sn < ctx.max_iters) { __nanosleep(100000); // 100us } @@ -309,7 +316,6 @@ public: ucs_status_t run_pingpong() { - size_t length = ucx_perf_get_message_size(&m_perf.params); unsigned my_index = rte_call(&m_perf, group_index); ucp_perf_cuda_params_handler params_handler(m_perf); @@ -317,8 +323,7 @@ public: ucx_perf_test_start_clock(&m_perf); UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel, - *m_gpu_ctx, params_handler.get_params(), - m_perf.recv_buffer, length, my_index); + *m_gpu_ctx, params_handler.get_params(), my_index); CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(); @@ -333,7 +338,6 @@ public: ucs_status_t run_stream_uni() { unsigned my_index = rte_call(&m_perf, group_index); - ucp_perf_cuda_params_handler params_handler(m_perf); ucp_perf_barrier(&m_perf); @@ -345,9 +349,8 @@ public: CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError); wait_for_kernel(); } else if (my_index == 0) { - size_t length = ucx_perf_get_message_size(&m_perf.params); ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>( - *m_gpu_ctx, m_perf.recv_buffer, length); + *m_gpu_ctx, params_handler.get_params()); } CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize); From 881d9ce95af440906b85917debc8314c943eb7e4 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 12:15:08 +0000 Subject: [PATCH 17/32] UCP/PERF: put multi --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index a0c009c9b6e..1fd8645e006 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -189,9 +189,12 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, params.remote_addresses[0], params.lengths[0], params.flags, &req); - // case UCX_PERF_CMD_PUT_MULTI: - // return ucp_device_put_multi(mem_list, element_list->elements, - // element_list->count, 0, &req); + case UCX_PERF_CMD_PUT_MULTI: + return ucp_device_put_multi(params.mem_list, params.addresses, + params.remote_addresses, + params.lengths, 1, + params.counter_remote, params.flags, + &req); // case UCX_PERF_CMD_PUT_PARTIAL: // return ucp_device_put_partial(mem_list, element_list->elements, // element_list->count, 0, &req); From 6276578dad5c1727958f444bea4ab90f12265295 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 12:28:07 +0000 Subject: [PATCH 18/32] UCP/PERF: put partial --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 13 ++++++++++--- src/ucp/api/device/ucp_device_impl.h | 1 + src/uct/api/device/uct_device_impl.h | 1 - 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 1fd8645e006..83e17283833 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -195,9 +195,16 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, params.lengths, 1, params.counter_remote, params.flags, &req); - // case UCX_PERF_CMD_PUT_PARTIAL: - // return ucp_device_put_partial(mem_list, element_list->elements, - // element_list->count, 0, &req); + case UCX_PERF_CMD_PUT_PARTIAL: + return ucp_device_put_multi_partial(params.mem_list, + params.indices, + params.mem_list->mem_list_length, + params.addresses, + params.remote_addresses, + params.lengths, + params.mem_list->mem_list_length, + 1, params.counter_remote, + params.flags, &req); } return UCS_ERR_INVALID_PARAM; diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index cad60dcd186..f82725f7576 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -307,6 +307,7 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( * @param [in] addresses Array of local addresses to send from. * @param [in] remote_addresses Array of remote addresses to send to. * @param [in] lengths Array of lengths in bytes for each send. + * @param [in] counter_index Index of remote increment descriptor. * @param [in] counter_inc_value Value of the remote increment. * @param [in] counter_remote_address Remote address to increment to. * @param [in] flags Flags to modify the function behavior. diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index 648cab75c36..0271e923dbd 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -124,7 +124,6 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add( * @param [in] addresses Array of local addresses to send from. * @param [in] remote_addresses Array of remote addresses to send to. * @param [in] lengths Array of lengths in bytes for each send. - * @param [in] counter_index Index of remote increment descriptor. * @param [in] counter_inc_value Value of the remote increment. * @param [in] counter_remote_address Remote address to increment to. * @param [in] flags Flags to modify the function behavior. From e235a43a3cdce78a9d6d5245ea44ae28bb90ccd2 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 12:40:35 +0000 Subject: [PATCH 19/32] UCP/PERF: Minor changes --- src/tools/perf/cuda/cuda_kernel.cuh | 2 -- src/tools/perf/cuda/ucp_cuda_kernel.cu | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 5a31d3dac41..6f0e44f8fdf 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -13,8 +13,6 @@ #include #include -#include - typedef unsigned long long ucx_perf_cuda_time_t; diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 83e17283833..182639d1f0f 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -14,6 +14,7 @@ #include #include +#include class ucp_perf_cuda_request_manager { From b8fe11f22ea757bb02e89126defdf82f1df80f2b Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 13:06:14 +0000 Subject: [PATCH 20/32] UCP/PERF: Minor changes --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 182639d1f0f..e428a74c610 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -223,7 +223,8 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx) do { status = ucp_device_progress_req(&req); - } while (status == UCS_INPROGRESS); + // TODO: remove NO_RESOURCE + } while ((status == UCS_INPROGRESS) || (status == UCS_ERR_NO_RESOURCE)); return status; } @@ -240,9 +241,10 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { while (request_mgr.get_pending_count() >= ctx.max_outstanding) { - status = request_mgr.progress(1); - if (status != UCS_OK) { - continue; + request_mgr.progress(1); + if (UCS_STATUS_IS_ERR(status)) { + ctx.status = status; + return; } } From 28454a4beb6fca1c04d42eba81aad2f6166ec7c3 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 13:54:25 +0000 Subject: [PATCH 21/32] UCP/PERF: Separate element for counter --- src/tools/perf/cuda/cuda_kernel.cuh | 2 +- src/tools/perf/cuda/ucp_cuda_kernel.cu | 19 ++++++++++++------- src/tools/perf/lib/libperf_int.h | 1 + src/tools/perf/lib/libperf_memory.c | 6 ++++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 6f0e44f8fdf..6a6f5b7b9da 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -52,7 +52,7 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx, static UCS_F_ALWAYS_INLINE uint64_t * ucx_perf_cuda_get_sn(const void *address, size_t length) { - return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t)); + return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length); } UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index e428a74c610..db7c1d75746 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -109,7 +109,8 @@ public: private: void init_mem_list(const ucx_perf_context_t &perf) { - size_t count = perf.params.msg_size_cnt; + /* +1 for the counter */ + size_t count = perf.params.msg_size_cnt + 1; ucp_device_mem_list_elem_t elems[count]; for (size_t i = 0; i < count; ++i) { elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | @@ -135,7 +136,8 @@ private: void init_elements(const ucx_perf_context_t &perf) { - size_t count = perf.params.msg_size_cnt; + /* +1 for the counter */ + size_t count = perf.params.msg_size_cnt + 1; std::vector indices(count); std::vector addresses(count); @@ -145,7 +147,8 @@ private: indices[i] = i; addresses[i] = (char *)perf.send_buffer + offset; remote_addresses[i] = perf.ucp.remote_addr + offset; - lengths[i] = perf.params.msg_size_list[i]; + lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE : + perf.params.msg_size_list[i]; offset += lengths[i]; } @@ -196,16 +199,18 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, params.lengths, 1, params.counter_remote, params.flags, &req); - case UCX_PERF_CMD_PUT_PARTIAL: + case UCX_PERF_CMD_PUT_PARTIAL:{ + unsigned counter_index = params.mem_list->mem_list_length - 1; return ucp_device_put_multi_partial(params.mem_list, params.indices, - params.mem_list->mem_list_length, + counter_index, params.addresses, params.remote_addresses, params.lengths, - params.mem_list->mem_list_length, - 1, params.counter_remote, + counter_index, 1, + params.counter_remote, params.flags, &req); + } } return UCS_ERR_INVALID_PARAM; diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h index 755a1c839e3..3b40ab9d9b7 100644 --- a/src/tools/perf/lib/libperf_int.h +++ b/src/tools/perf/lib/libperf_int.h @@ -30,6 +30,7 @@ BEGIN_C_DECLS #define UCT_PERF_TEST_AM_ID 5 #define ADDR_BUF_SIZE 4096 #define EXTRA_INFO_SIZE 256 +#define ONESIDED_SIGNAL_SIZE sizeof(uint64_t) #define UCX_PERF_TEST_FOREACH(perf) \ while (!ucx_perf_context_done(perf)) diff --git a/src/tools/perf/lib/libperf_memory.c b/src/tools/perf/lib/libperf_memory.c index 0b2a14c1e5f..8fb5a1bb9e5 100644 --- a/src/tools/perf/lib/libperf_memory.c +++ b/src/tools/perf/lib/libperf_memory.c @@ -112,7 +112,8 @@ ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf) } /* Allocate send buffer memory */ - status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count, + status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count + + ONESIDED_SIGNAL_SIZE, params->send_mem_type, &perf->send_buffer, &perf->ucp.send_memh); if (status != UCS_OK) { @@ -125,7 +126,8 @@ ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf) perf->ucp.recv_exported_mem.address = NULL; /* Allocate receive buffer memory */ - status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count, + status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count + + ONESIDED_SIGNAL_SIZE, params->recv_mem_type, &perf->recv_buffer, &perf->ucp.recv_memh); if (status != UCS_OK) { From 09a7201fd391d52a95f0b790835320786df0ed9e Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 14:12:30 +0000 Subject: [PATCH 22/32] UCP/PERF: Temporary fis for single, until counter write is merged --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index db7c1d75746..2e748104876 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -191,8 +191,8 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, return ucp_device_put_single(params.mem_list, params.indices[0], params.addresses[0], params.remote_addresses[0], - params.lengths[0], params.flags, - &req); + params.lengths[0] + ONESIDED_SIGNAL_SIZE, + params.flags, &req); case UCX_PERF_CMD_PUT_MULTI: return ucp_device_put_multi(params.mem_list, params.addresses, params.remote_addresses, From 99245048d9842e51332a8db34538ac6723265932 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 15:02:52 +0000 Subject: [PATCH 23/32] UCP/PERF: Fixed memory corruption --- src/tools/perf/perftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index 00d782e5632..7da9fea02ed 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -465,8 +465,8 @@ static ucs_status_t setup_sock_rte_p2p(struct perftest_context *ctx) if (peer_params.super.msg_size_cnt != 0) { peer_params.super.msg_size_list = - calloc(ctx->params.super.msg_size_cnt, - sizeof(*ctx->params.super.msg_size_list)); + calloc(peer_params.super.msg_size_cnt, + sizeof(*peer_params.super.msg_size_list)); if (peer_params.super.msg_size_list == NULL) { status = UCS_ERR_NO_MEMORY; goto err_close_connfd; From 24e20f86524ffcdc30276b030550efda76db9df6 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Thu, 18 Sep 2025 15:27:00 +0000 Subject: [PATCH 24/32] UCP/PERF: Send all elements in single --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 2e748104876..0f147026579 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -76,6 +76,7 @@ private: struct ucp_perf_cuda_params { ucp_device_mem_list_handle_h mem_list; + size_t length; unsigned *indices; void **addresses; uint64_t *remote_addresses; @@ -160,11 +161,14 @@ private: void init_counters(const ucx_perf_context_t &perf) { - size_t length = ucx_perf_get_message_size(&perf.params); + m_params.length = ucx_perf_get_message_size(&perf.params); m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn( - (void*)perf.ucp.remote_addr, length); - m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer, length); - m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer, length); + (void*)perf.ucp.remote_addr, + m_params.length); + m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer, + m_params.length); + m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer, + m_params.length); m_params.flags = UCP_DEVICE_FLAG_NODELAY; } @@ -191,7 +195,7 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, return ucp_device_put_single(params.mem_list, params.indices[0], params.addresses[0], params.remote_addresses[0], - params.lengths[0] + ONESIDED_SIGNAL_SIZE, + params.length + ONESIDED_SIGNAL_SIZE, params.flags, &req); case UCX_PERF_CMD_PUT_MULTI: return ucp_device_put_multi(params.mem_list, params.addresses, From 8053fdd84149f77c996dcef373e64bcb6aac3c2d Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Fri, 19 Sep 2025 06:52:31 +0000 Subject: [PATCH 25/32] UCP/PERF: Precise doc on blocks param --- src/tools/perf/perftest_params.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c index 6c3210d645e..e24a0be5507 100644 --- a/src/tools/perf/perftest_params.c +++ b/src/tools/perf/perftest_params.c @@ -138,9 +138,11 @@ static void usage(const struct perftest_context *ctx, const char *program) printf("\n"); printf(" UCP only:\n"); printf(" -T [:]\n"); - printf(" number of threads in the test (%d)\n", + printf(" number of threads in the test (%d).\n", ctx->params.super.thread_count); printf(" blocks is optional, it corresponds to the number of device blocks\n"); + printf(" if blocks is specified, then threads value corresponds to the number\n"); + printf(" of device threads in each block\n"); printf(" -M thread support level for progress engine (single)\n"); printf(" single - only the master thread can access\n"); printf(" serialized - one thread can access at a time\n"); From cd48e9e13a744769f3d48c8287fb8ff56c2b434f Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Fri, 19 Sep 2025 11:26:49 +0000 Subject: [PATCH 26/32] UCP/PERF: Exclude multi-thread tests from CI, add API tests --- contrib/ucx_perftest_config/test_types_ucp_device_cuda | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda index 3d73617525c..e984ea2b830 100644 --- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda +++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda @@ -1,7 +1,9 @@ # # UCP basic device cuda tests # -ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128 -ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128 +ucp_device_cuda_single_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 +ucp_device_cuda_single_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 +ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 +ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 +ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 +ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 From a01882335c2077dde47aa0107abcabbfd62cb880 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Mon, 22 Sep 2025 08:32:15 +0000 Subject: [PATCH 27/32] UCP/PERF: Fixed ucp_device_progress_req --- src/tools/perf/cuda/ucp_cuda_kernel.cu | 4 +++- src/ucp/api/device/ucp_device_impl.h | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 0f147026579..44d96585d98 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -250,8 +250,9 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { while (request_mgr.get_pending_count() >= ctx.max_outstanding) { - request_mgr.progress(1); + status = request_mgr.progress(1); if (UCS_STATUS_IS_ERR(status)) { + ucs_device_error("progress failed: %d", status); ctx.status = status; return; } @@ -273,6 +274,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, while (request_mgr.get_pending_count() > 0) { status = request_mgr.progress(max_iters); if (UCS_STATUS_IS_ERR(status)) { + ucs_device_error("progress failed: %d", status); break; } } diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index f82725f7576..08050f9ef3a 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -391,7 +391,12 @@ UCS_F_DEVICE ucs_status_t ucp_device_progress_req(ucp_device_request_t *req) } status = uct_device_ep_progress(req->device_ep); - return (status != UCS_OK ? status : UCS_INPROGRESS); + if (status != UCS_OK) { + return status; + } + + return (ucs_likely(req->comp.count == 0)) ? req->comp.status : + UCS_INPROGRESS; } #endif /* UCP_DEVICE_IMPL_H */ From 82b253d290fd767ab0aaae5f247230b2a1c48ca6 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Mon, 22 Sep 2025 10:05:33 +0000 Subject: [PATCH 28/32] UCP/PERF: Allocate requests in shared memory --- src/tools/perf/cuda/cuda_kernel.cuh | 22 ++++++++++++++-------- src/tools/perf/cuda/ucp_cuda_kernel.cu | 21 ++++++++++++--------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 6a6f5b7b9da..4f60e4f1e49 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -89,17 +89,17 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from return bits; } -#define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \ +#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \ do { \ switch (cmd) { \ case UCX_PERF_CMD_PUT_SINGLE: \ - func<<>>(__VA_ARGS__); \ + func<<>>(__VA_ARGS__); \ break; \ case UCX_PERF_CMD_PUT_MULTI: \ - func<<>>(__VA_ARGS__); \ + func<<>>(__VA_ARGS__); \ break; \ case UCX_PERF_CMD_PUT_PARTIAL: \ - func<<>>(__VA_ARGS__); \ + func<<>>(__VA_ARGS__); \ break; \ default: \ ucs_error("Unsupported cmd: %d", cmd); \ @@ -113,18 +113,24 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from ucx_perf_cmd_t _cmd = perf.params.command; \ unsigned _blocks = perf.params.device_block_count; \ unsigned _threads = perf.params.device_thread_count; \ + size_t _shared_size = _threads * perf.params.max_outstanding * \ + sizeof(ucp_device_request_t); \ switch (_level) { \ case UCS_DEVICE_LEVEL_THREAD: \ - UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\ + _shared_size, func, __VA_ARGS__); \ break; \ case UCS_DEVICE_LEVEL_WARP: \ - UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\ + _shared_size, func, __VA_ARGS__); \ break; \ case UCS_DEVICE_LEVEL_BLOCK: \ - UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\ + _shared_size, func, __VA_ARGS__); \ break; \ case UCS_DEVICE_LEVEL_GRID: \ - UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, func, __VA_ARGS__); \ + UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\ + _shared_size, func, __VA_ARGS__); \ break; \ default: \ ucs_error("Unsupported level: %d", _level); \ diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 44d96585d98..5106512c7de 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -19,7 +19,9 @@ class ucp_perf_cuda_request_manager { public: - __device__ ucp_perf_cuda_request_manager(size_t size) : m_size(size) + __device__ + ucp_perf_cuda_request_manager(size_t size, ucp_device_request_t *requests) + : m_size(size), m_requests(&requests[size * threadIdx.x]) { assert(m_size <= CAPACITY); for (size_t i = 0; i < m_size; ++i) { @@ -70,7 +72,7 @@ private: static const size_t CAPACITY = 128; size_t m_size; - ucp_device_request_t m_requests[CAPACITY]; + ucp_device_request_t *m_requests; uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)]; }; @@ -222,9 +224,9 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, template UCS_F_DEVICE ucs_status_t -ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx) +ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, + ucp_device_request_t &req) { - ucp_device_request_t req; ucs_status_t status = ucp_perf_cuda_send_nbx(params, idx, req); if (status != UCS_OK) { return status; @@ -243,9 +245,10 @@ __global__ void ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ucp_perf_cuda_params params) { + extern __shared__ ucp_device_request_t requests[]; ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; - ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding); + ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding, requests); ucs_status_t status; for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { @@ -268,7 +271,6 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, } while (status == UCS_ERR_NO_RESOURCE); ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time); - __syncthreads(); } while (request_mgr.get_pending_count() > 0) { @@ -288,27 +290,28 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, ucp_perf_cuda_params params, bool is_sender) { + extern __shared__ ucp_device_request_t requests[]; + ucp_device_request_t &req = requests[threadIdx.x]; ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns(); ucx_perf_counter_t max_iters = ctx.max_iters; ucs_status_t status = UCS_OK; for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) { if (is_sender) { - status = ucp_perf_cuda_send_sync(params, idx); + status = ucp_perf_cuda_send_sync(params, idx, req); if (status != UCS_OK) { break; } ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); } else { ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1); - status = ucp_perf_cuda_send_sync(params, idx); + status = ucp_perf_cuda_send_sync(params, idx, req); if (status != UCS_OK) { break; } } ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time); - __syncthreads(); } ctx.status = status; From 9cd91ac7908105fa06c95769e6ab19783d08a889 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Mon, 22 Sep 2025 13:46:45 +0000 Subject: [PATCH 29/32] UCP/PERF: Aggregate MT result --- src/tools/perf/cuda/cuda_kernel.cuh | 5 +++-- src/tools/perf/cuda/ucp_cuda_kernel.cu | 1 + src/tools/perf/lib/libperf_int.h | 20 +++----------------- src/tools/perf/lib/ucp_tests.cc | 10 +++++----- src/tools/perf/lib/uct_tests.cc | 8 ++++---- 5 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh index 4f60e4f1e49..9d9c42b598e 100644 --- a/src/tools/perf/cuda/cuda_kernel.cuh +++ b/src/tools/perf/cuda/cuda_kernel.cuh @@ -163,11 +163,12 @@ public: size_t msg_length = ucx_perf_get_message_size(&m_perf.params); ucx_perf_counter_t last_completed = 0; ucx_perf_counter_t completed = m_cpu_ctx->completed_iters; - while (1) { + unsigned thread_count = m_perf.params.device_thread_count; + while (true) { ucx_perf_counter_t delta = completed - last_completed; if (delta > 0) { // TODO: calculate latency percentile on kernel - ucx_perf_update(&m_perf, delta, msg_length); + ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length); } else if (completed >= m_perf.max_iter) { break; } diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 5106512c7de..822dbcf22e5 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -271,6 +271,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, } while (status == UCS_ERR_NO_RESOURCE); ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time); + __syncthreads(); } while (request_mgr.get_pending_count() > 0) { diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h index 3b40ab9d9b7..3e808ced13d 100644 --- a/src/tools/perf/lib/libperf_int.h +++ b/src/tools/perf/lib/libperf_int.h @@ -229,12 +229,13 @@ static inline void ucx_perf_omp_barrier(ucx_perf_context_t *perf) static UCS_F_ALWAYS_INLINE void ucx_perf_update(ucx_perf_context_t *perf, ucx_perf_counter_t iters, + ucx_perf_counter_t msgs, size_t bytes_per_iter) { perf->current.time = ucs_get_time(); perf->current.iters += iters; - perf->current.bytes += bytes_per_iter * iters; - perf->current.msgs += iters; + perf->current.bytes += msgs * bytes_per_iter; + perf->current.msgs += msgs; if (iters == 1) { perf->timing_queue[perf->timing_queue_head] = perf->current.time - @@ -254,21 +255,6 @@ static UCS_F_ALWAYS_INLINE void ucx_perf_update(ucx_perf_context_t *perf, } } -static UCS_F_ALWAYS_INLINE void -ucx_perf_update_multi(ucx_perf_context_t *perf, ucx_perf_counter_t iters, - size_t bytes) -{ - perf->current.time = ucs_get_time(); - perf->current.iters += iters; - perf->current.bytes += bytes; - perf->current.msgs += iters; - perf->prev_time = perf->current.time; - - if (ucs_likely(perf->current.iters < perf->params.max_iter)) { - ucx_perf_report(perf); - } -} - END_C_DECLS #endif diff --git a/src/tools/perf/lib/ucp_tests.cc b/src/tools/perf/lib/ucp_tests.cc index d683bfbc5cd..061c7f1119e 100644 --- a/src/tools/perf/lib/ucp_tests.cc +++ b/src/tools/perf/lib/ucp_tests.cc @@ -764,7 +764,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base { send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey); recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn); wait_recv_window(m_max_outstanding); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++sn; } } else if (my_index == 1) { @@ -773,7 +773,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base { wait_recv_window(m_max_outstanding); send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey, m_perf.current.iters == 0); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++sn; } } @@ -828,7 +828,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base { send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey); recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++sn; } @@ -837,7 +837,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base { } else if (my_index == 0) { UCX_PERF_TEST_FOREACH(&m_perf) { recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++sn; } @@ -850,7 +850,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base { UCX_PERF_TEST_FOREACH(&m_perf) { send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey, m_perf.current.iters == 0); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++sn; } diff --git a/src/tools/perf/lib/uct_tests.cc b/src/tools/perf/lib/uct_tests.cc index 32e2de2efde..f7f44d5f213 100644 --- a/src/tools/perf/lib/uct_tests.cc +++ b/src/tools/perf/lib/uct_tests.cc @@ -451,7 +451,7 @@ class uct_perf_test_runner { UCX_PERF_TEST_FOREACH(&m_perf) { send_b(ep, send_sn, send_sn - 1, buffer, length, remote_addr, rkey, NULL); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); do { progress_responder(); @@ -471,7 +471,7 @@ class uct_perf_test_runner { send_b(ep, send_sn, send_sn - 1, buffer, length, remote_addr, rkey, NULL); - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); ++send_sn; } } @@ -538,7 +538,7 @@ class uct_perf_test_runner { &m_completion); } - ucx_perf_update(&m_perf, 1, length); + ucx_perf_update(&m_perf, 1, 1, length); } if (!flow_control) { @@ -699,7 +699,7 @@ class uct_perf_test_runner { ucx_perf_get_time(&m_perf); ucs_assert(outstanding() == 0); if (my_index == 1) { - ucx_perf_update(&m_perf, 0, 0); + ucx_perf_update(&m_perf, 0, 1, 0); } return UCS_OK; From 31793c25ac9d7b74816717958fa69379a693f131 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Mon, 22 Sep 2025 14:28:59 +0000 Subject: [PATCH 30/32] UCP/PERF: Temporary cuda_ipc fix --- src/uct/cuda/cuda_ipc/cuda_ipc.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh index 173a5f019f3..bf24582807c 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh +++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh @@ -201,9 +201,8 @@ UCS_F_DEVICE ucs_status_t uct_cuda_ipc_ep_put_single( } __syncthreads(); - if (threadIdx.x == 0) { - comp->count = 0; - } + // TODO: temporary fix, to be removed once cuda_ipc PR is merged + comp->count = 0; return UCS_OK; } From 8265571be23fca2b7d94c8a6c854c9cb3c915b54 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Tue, 23 Sep 2025 06:34:30 +0000 Subject: [PATCH 31/32] UCP/PERF: Added MT CI tests --- contrib/test_jenkins.sh | 12 +++++++++++- .../ucx_perftest_config/test_types_ucp_device_cuda | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index d10276fdaec..260c526c75f 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -657,8 +657,18 @@ run_ucx_perftest_cuda_device() { ucx_perftest="$ucx_inst/bin/ucx_perftest" ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda" - # TODO: Run on all GPUs + # TODO: Run on all GPUs & NICs combinations ucp_client_args="-a cuda:0 $(hostname)" + gda_tls="cuda_copy,rc,rc_gda" + cuda_ipc_tls="cuda_copy,rc,cuda_ipc" + + # TODO: Run with cuda_ipc_tls + for tls in "$gda_tls" + do + export UCX_TLS=${tls} + run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0 + done + unset UCX_TLS run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0 } diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda index e984ea2b830..bf2e5bac214 100644 --- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda +++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda @@ -7,3 +7,13 @@ ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256: ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 + +# Increase number of threads after following fixes: +# - Use thread-local memory instead of shared for requests (limit 48K) +# - Fix WQE size limit of 1024 +ucp_device_cuda_single_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32 -O 8 +ucp_device_cuda_single_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32 -O 8 +ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8 +ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8 +ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8 +ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8 From f9957eb76fec690cc8a5269ea0d73a1388144552 Mon Sep 17 00:00:00 2001 From: Ilia Yastrebov Date: Tue, 23 Sep 2025 09:59:58 +0000 Subject: [PATCH 32/32] UCP/PERF: Reduced dimensions of CI tests due to WQE size 1024 limitation --- .../test_types_ucp_device_cuda | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda index bf2e5bac214..86e24d125ae 100644 --- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda +++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda @@ -1,8 +1,8 @@ # # UCP basic device cuda tests # -ucp_device_cuda_single_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -ucp_device_cuda_single_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 +ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000 +ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000 ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 @@ -11,9 +11,9 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2 # Increase number of threads after following fixes: # - Use thread-local memory instead of shared for requests (limit 48K) # - Fix WQE size limit of 1024 -ucp_device_cuda_single_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32 -O 8 -ucp_device_cuda_single_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32 -O 8 -ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8 -ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8 -ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8 -ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8 +ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 +ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 +ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 +ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 +ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 +ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2