From a7a6f00857a9322caf02879ed2cb46bd4b8223ff Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 07:39:22 +0000
Subject: [PATCH 01/32] UCP/PERF: Added config for block count

---
 src/tools/perf/api/libperf.h     |  1 +
 src/tools/perf/perftest.c        |  1 +
 src/tools/perf/perftest_params.c | 76 ++++++++++++++++++++++----------
 3 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h
index bec2786007b..62ea68b9f11 100644
--- a/src/tools/perf/api/libperf.h
+++ b/src/tools/perf/api/libperf.h
@@ -284,6 +284,7 @@ typedef struct ucx_perf_params {
     double                 percentile_rank; /* The percentile rank of the percentile reported
                                                in latency tests */
     unsigned               device_thread_count; /* Number of device threads */
+    unsigned               device_block_count; /* Number of device blocks */
 
     void                   *rte_group;      /* Opaque RTE group handle */
     ucx_perf_rte_t         *rte;            /* RTE functions used to exchange data */
diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c
index ca514beaaeb..67cb6543291 100644
--- a/src/tools/perf/perftest.c
+++ b/src/tools/perf/perftest.c
@@ -209,6 +209,7 @@ ucs_status_t init_test_params(perftest_params_t *params)
     params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG;
     params->super.ucp.am_hdr_size   = 0;
     params->super.device_thread_count = 1;
+    params->super.device_block_count  = 1;
     params->super.ucp.is_daemon_mode  = 0;
     params->super.ucp.dmn_local_addr  = empty_addr;
     params->super.ucp.dmn_remote_addr = empty_addr;
diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c
index 6b431b1d526..0ba96a64f1c 100644
--- a/src/tools/perf/perftest_params.c
+++ b/src/tools/perf/perftest_params.c
@@ -131,8 +131,9 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("                        signal - signal-based timer\n");
     printf("\n");
     printf("  UCP only:\n");
-    printf("     -T <threads>   number of threads in the test (%d)\n",
+    printf("     -T <threads>[:<blocks>]   number of threads in the test (%d)\n",
            ctx->params.super.thread_count);
+    printf("                    blocks is optional, it corresponds to the number of device blocks\n");
     printf("     -M <thread>    thread support level for progress engine (single)\n");
     printf("                        single     - only the master thread can access\n");
     printf("                        serialized - one thread can access at a time\n");
@@ -169,23 +170,30 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("\n");
 }
 
-static ucs_status_t parse_device_id(const char *opt_arg, int *device_id)
+static ucs_status_t parse_int(const char *opt_arg, int *value, const char *desc,
+                              int min_value, int max_value)
 {
     char *endptr;
-    int parsed_device_id;
+    int parsed_value;
 
     if (opt_arg == NULL) {
-        ucs_error("device id string is NULL");
+        ucs_error("%s string is NULL", desc);
         return UCS_ERR_INVALID_PARAM;
     }
 
-    parsed_device_id = strtol(opt_arg, &endptr, 10);
-    if ((endptr == opt_arg) || (*endptr != '\0') || (parsed_device_id < 0)) {
-        ucs_error("Failed to parse device id: %s", opt_arg);
+    parsed_value = strtol(opt_arg, &endptr, 10);
+    if ((endptr == opt_arg) || (*endptr != '\0')) {
+        ucs_error("failed to parse %s: %s", desc, opt_arg);
         return UCS_ERR_INVALID_PARAM;
     }
 
-    *device_id = parsed_device_id;
+    if ((parsed_value < min_value) || (parsed_value > max_value)) {
+        ucs_error("value for %s (%s) is out of range: [%d, %d]", desc, opt_arg,
+                  min_value, max_value);
+        return UCS_ERR_INVALID_PARAM;
+    }
+
+    *value = parsed_value;
     return UCS_OK;
 }
 
@@ -235,7 +243,7 @@ parse_accel_device(char *opt_arg, ucx_perf_accel_dev_t *dev)
     if (NULL == token) {
         device_id = UCX_PERF_MEM_DEV_DEFAULT;
     } else {
-        status = parse_device_id(token, &device_id);
+        status = parse_int(token, &device_id, "device id", 0, INT_MAX);
         if (status != UCS_OK) {
             return status;
         }
@@ -307,6 +315,36 @@ static ucs_status_t parse_accel_device_params(const char *opt_arg,
     return UCS_OK;
 }
 
+static ucs_status_t parse_thread_params(const char *opt_arg,
+                                        unsigned *thread_count,
+                                        unsigned *block_count)
+{
+    const char *delim = ":";
+    char *saveptr = NULL;
+    char *token, *arg;
+    ucs_status_t status;
+
+    arg = ucs_alloca(strlen(opt_arg) + 1);
+    strcpy(arg, opt_arg);
+    token = strtok_r(arg, delim, &saveptr);
+    status = parse_int(token, thread_count, "thread count", 1, INT_MAX);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    token = strtok_r(NULL, delim, &saveptr);
+    if (token != NULL) {
+        status = parse_int(token, block_count, "block count", 1, INT_MAX);
+        if (status != UCS_OK) {
+            return status;
+        }
+    } else {
+        *block_count = 1;
+    }
+
+    return UCS_OK;
+}
+
 static ucs_status_t parse_message_sizes_params(const char *opt_arg,
                                                ucx_perf_params_t *params)
 {
@@ -563,8 +601,8 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt,
             return UCS_ERR_INVALID_PARAM;
         }
     case 'T':
-        params->super.thread_count = atoi(opt_arg);
-        return UCS_OK;
+        return parse_thread_params(opt_arg, &params->super.thread_count,
+                                   &params->super.device_block_count);
     case 'A':
         if (!strcmp(opt_arg, "thread") || !strcmp(opt_arg, "thread_spinlock")) {
             params->super.async_mode = UCS_ASYNC_MODE_THREAD_SPINLOCK;
@@ -597,19 +635,11 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt,
             return UCS_ERR_INVALID_PARAM;
         }
     case 'm':
-        if (UCS_OK != parse_mem_type_params(opt_arg,
-                                            &params->super.send_mem_type,
-                                            &params->super.recv_mem_type)) {
-            return UCS_ERR_INVALID_PARAM;
-        }
-        return UCS_OK;
+        return parse_mem_type_params(opt_arg, &params->super.send_mem_type,
+                                     &params->super.recv_mem_type);
     case 'a':
-        if (UCS_OK != parse_accel_device_params(opt_arg,
-                                                &params->super.send_device,
-                                                &params->super.recv_device)) {
-            return UCS_ERR_INVALID_PARAM;
-        }
-        return UCS_OK;
+        return parse_accel_device_params(opt_arg, &params->super.send_device,
+                                         &params->super.recv_device);
     case 'y':
         params->super.flags |= UCX_PERF_TEST_FLAG_AM_RECV_COPY;
         return UCS_OK;

From c27a9a867ab1cae46e10f315a6fcbd563c3c18e2 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 07:46:18 +0000
Subject: [PATCH 02/32] UCP/PERF: Added tests CMDs for single/partial

---
 src/tools/perf/api/libperf.h        |  2 ++
 src/tools/perf/cuda/cuda_kernel.cuh |  4 +++-
 src/tools/perf/lib/libperf.c        |  4 ++++
 src/tools/perf/lib/libperf_memory.c |  2 ++
 src/tools/perf/perftest.c           | 12 ++++++++++++
 5 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h
index 62ea68b9f11..d76b859e016 100644
--- a/src/tools/perf/api/libperf.h
+++ b/src/tools/perf/api/libperf.h
@@ -30,7 +30,9 @@ typedef enum {
 typedef enum {
     UCX_PERF_CMD_AM,
     UCX_PERF_CMD_PUT,
+    UCX_PERF_CMD_PUT_SINGLE,
     UCX_PERF_CMD_PUT_MULTI,
+    UCX_PERF_CMD_PUT_PARTIAL,
     UCX_PERF_CMD_GET,
     UCX_PERF_CMD_ADD,
     UCX_PERF_CMD_FADD,
diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index d79ed37a97f..bc4c79b8370 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -150,7 +150,9 @@ template<typename Runner> ucs_status_t
 ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
 {
     Runner runner(*perf);
-    if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) {
+    if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
+        (perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
         if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
             return runner.run_pingpong();
         } else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) {
diff --git a/src/tools/perf/lib/libperf.c b/src/tools/perf/lib/libperf.c
index 7a81a9b2228..cd1b88c44a6 100644
--- a/src/tools/perf/lib/libperf.c
+++ b/src/tools/perf/lib/libperf.c
@@ -481,7 +481,9 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params,
                                   attr.cap.put.max_bcopy, attr.cap.put.max_zcopy);
         max_iov  = attr.cap.put.max_iov;
         break;
+    case UCX_PERF_CMD_PUT_SINGLE:
     case UCX_PERF_CMD_PUT_MULTI:
+    case UCX_PERF_CMD_PUT_PARTIAL:
         min_size = 0;
         max_size = 0;
         max_iov  = 0;
@@ -885,7 +887,9 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params,
     message_size = ucx_perf_get_message_size(params);
     switch (params->command) {
     case UCX_PERF_CMD_PUT:
+    case UCX_PERF_CMD_PUT_SINGLE:
     case UCX_PERF_CMD_PUT_MULTI:
+    case UCX_PERF_CMD_PUT_PARTIAL:
     case UCX_PERF_CMD_GET:
         ucp_params->features |= UCP_FEATURE_RMA;
         break;
diff --git a/src/tools/perf/lib/libperf_memory.c b/src/tools/perf/lib/libperf_memory.c
index c74c7f79f13..0b2a14c1e5f 100644
--- a/src/tools/perf/lib/libperf_memory.c
+++ b/src/tools/perf/lib/libperf_memory.c
@@ -267,7 +267,9 @@ ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf)
     case UCX_PERF_CMD_PUT:
         flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT;
         break;
+    case UCX_PERF_CMD_PUT_SINGLE:
     case UCX_PERF_CMD_PUT_MULTI:
+    case UCX_PERF_CMD_PUT_PARTIAL:
         flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT | UCT_MD_MEM_ACCESS_REMOTE_ATOMIC;
         break;
     case UCX_PERF_CMD_GET:
diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c
index 67cb6543291..f79b6214676 100644
--- a/src/tools/perf/perftest.c
+++ b/src/tools/perf/perftest.c
@@ -80,12 +80,24 @@ test_type_t tests[] = {
     {"ucp_put_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
      "put bandwidth", "overhead", 32},
 
+    {"ucp_put_single_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_TEST_TYPE_STREAM_UNI,
+     "put single bandwidth", "overhead", 32},
+
+    {"ucp_put_single_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_TEST_TYPE_PINGPONG,
+     "put single latency", "latency", 1},
+
     {"ucp_put_multi_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_MULTI, UCX_PERF_TEST_TYPE_STREAM_UNI,
      "put multi bandwidth", "overhead", 32},
 
     {"ucp_put_multi_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_MULTI, UCX_PERF_TEST_TYPE_PINGPONG,
      "put multi latency", "latency", 32},
 
+    {"ucp_put_partial_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_PARTIAL, UCX_PERF_TEST_TYPE_STREAM_UNI,
+     "put partial bandwidth", "overhead", 32},
+
+    {"ucp_put_partial_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_PARTIAL, UCX_PERF_TEST_TYPE_PINGPONG,
+     "put partial latency", "latency", 32},
+
     {"ucp_get", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI,
      "get latency / bandwidth / message rate", "latency", 1},
 

From 0927c2f18703e0ea5f17e937830853bc3b81e065 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 07:48:44 +0000
Subject: [PATCH 03/32] UCP/PERF: Compact form for message sizes

---
 src/tools/perf/perftest_params.c | 63 ++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c
index 0ba96a64f1c..eb188bb0583 100644
--- a/src/tools/perf/perftest_params.c
+++ b/src/tools/perf/perftest_params.c
@@ -74,6 +74,7 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("     -s <size>      list of scatter-gather sizes for single message (%zu)\n",
                                 ctx->params.super.msg_size_list[0]);
     printf("                    for example: \"-s 16,48,8192,8192,14\"\n");
+    printf("                    compact form example: \"-s 1024:16 expands to [1024, ..., 1024] with 16 elements\n");
     printf("     -m <send mem type>[,<recv mem type>]\n");
     printf("                    memory type of message for sender and receiver (host)\n");
     print_memory_type_usage();
@@ -131,7 +132,8 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("                        signal - signal-based timer\n");
     printf("\n");
     printf("  UCP only:\n");
-    printf("     -T <threads>[:<blocks>]   number of threads in the test (%d)\n",
+    printf("     -T <threads>[:<blocks>]\n");
+    printf("                    number of threads in the test (%d)\n",
            ctx->params.super.thread_count);
     printf("                    blocks is optional, it corresponds to the number of device blocks\n");
     printf("     -M <thread>    thread support level for progress engine (single)\n");
@@ -345,8 +347,8 @@ static ucs_status_t parse_thread_params(const char *opt_arg,
     return UCS_OK;
 }
 
-static ucs_status_t parse_message_sizes_params(const char *opt_arg,
-                                               ucx_perf_params_t *params)
+static ucs_status_t
+parse_message_sizes_list(const char *opt_arg, ucx_perf_params_t *params)
 {
     const char delim = ',';
     size_t *msg_size_list, token_num, token_it;
@@ -388,6 +390,61 @@ static ucs_status_t parse_message_sizes_params(const char *opt_arg,
     return UCS_OK;
 }
 
+static ucs_status_t
+parse_message_sizes_compact(const char *opt_arg, ucx_perf_params_t *params)
+{
+    const char *delim = ":";
+    char *saveptr = NULL;
+    char *token, *arg;
+    int msg_size, element_count, i;
+    size_t *msg_size_list;
+    ucs_status_t status;
+
+
+    arg = ucs_alloca(strlen(opt_arg) + 1);
+    strcpy(arg, opt_arg);
+    token = strtok_r(arg, delim, &saveptr);
+    status = parse_int(token, &msg_size, "message size", 1, INT_MAX);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    token = strtok_r(NULL, delim, &saveptr);
+    status = parse_int(token, &element_count, "elements", 1, INT_MAX);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    msg_size_list = realloc(params->msg_size_list,
+                            sizeof(*params->msg_size_list) * element_count);
+    if (NULL == msg_size_list) {
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    params->msg_size_list = msg_size_list;
+    for (i = 0; i < element_count; ++i) {
+        params->msg_size_list[i] = msg_size;
+    }
+
+    params->msg_size_cnt = element_count;
+    return UCS_OK;
+}
+
+static int is_compact_form(const char *input)
+{
+    return strchr(input, ':') != NULL;
+}
+
+static ucs_status_t parse_message_sizes_params(const char *opt_arg,
+                                               ucx_perf_params_t *params)
+{
+    if (is_compact_form(opt_arg)) {
+        return parse_message_sizes_compact(opt_arg, params);
+    }
+
+    return parse_message_sizes_list(opt_arg, params);
+}
+
 static ucs_status_t parse_ucp_datatype_params(const char *opt_arg,
                                               ucp_perf_datatype_t *datatype)
 {

From 5a4be0f6232bd81d3c082a5d522c8753621342b2 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 08:02:01 +0000
Subject: [PATCH 04/32] UCP/PERF: Device level option

---
 src/tools/perf/api/libperf.h     |  2 ++
 src/tools/perf/perftest.c        |  1 +
 src/tools/perf/perftest.h        |  2 +-
 src/tools/perf/perftest_params.c | 22 ++++++++++++++++++++++
 src/ucs/sys/device_code.h        | 18 ++++++++++++++++++
 5 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h
index d76b859e016..2e1338b2423 100644
--- a/src/tools/perf/api/libperf.h
+++ b/src/tools/perf/api/libperf.h
@@ -11,6 +11,7 @@
 #define UCX_LIBPERF_H
 
 #include <ucs/sys/compiler.h>
+#include <ucs/sys/device_code.h>
 
 BEGIN_C_DECLS
 
@@ -267,6 +268,7 @@ typedef struct ucx_perf_params {
     ucs_memory_type_t      recv_mem_type;   /* Recv memory type */
     ucx_perf_accel_dev_t   send_device;     /* Send memory device for gdaki */
     ucx_perf_accel_dev_t   recv_device;     /* Recv memory device for gdaki */
+    ucs_device_level_t     device_level;    /* Device level for gdaki */
     unsigned               flags;           /* See ucx_perf_test_flags. */
 
     size_t                 *msg_size_list;  /* Test message sizes list. The size
diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c
index f79b6214676..00d782e5632 100644
--- a/src/tools/perf/perftest.c
+++ b/src/tools/perf/perftest.c
@@ -215,6 +215,7 @@ ucs_status_t init_test_params(perftest_params_t *params)
     params->super.recv_mem_type     = UCS_MEMORY_TYPE_HOST;
     params->super.send_device       = default_dev;
     params->super.recv_device       = default_dev;
+    params->super.device_level      = UCS_DEVICE_LEVEL_THREAD;
     params->super.msg_size_cnt      = 1;
     params->super.iov_stride        = 0;
     params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;
diff --git a/src/tools/perf/perftest.h b/src/tools/perf/perftest.h
index aa487b1aded..d61b44108fa 100644
--- a/src/tools/perf/perftest.h
+++ b/src/tools/perf/perftest.h
@@ -19,7 +19,7 @@
 #endif
 
 #define TL_RESOURCE_NAME_NONE   "<none>"
-#define TEST_PARAMS_ARGS        "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:a:R:lyz"
+#define TEST_PARAMS_ARGS        "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:a:R:lyzL:"
 #define TEST_ID_UNDEFINED       -1
 
 #define DEFAULT_DAEMON_PORT     1338
diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c
index eb188bb0583..6c3210d645e 100644
--- a/src/tools/perf/perftest_params.c
+++ b/src/tools/perf/perftest_params.c
@@ -71,6 +71,11 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("                   Accelerator device type and device id to use for running the test.\n");
     printf("                    device id is optional, it corresponds to the index of\n");
     printf("                    the device in the list of available devices\n");
+    printf("     -L <level>     device cooperation level for gdaki (thread)\n");
+    printf("                    thread - thread level\n");
+    printf("                    warp   - warp level\n");
+    printf("                    block  - block level\n");
+    printf("                    grid   - grid level\n");
     printf("     -s <size>      list of scatter-gather sizes for single message (%zu)\n",
                                 ctx->params.super.msg_size_list[0]);
     printf("                    for example: \"-s 16,48,8192,8192,14\"\n");
@@ -445,6 +450,21 @@ static ucs_status_t parse_message_sizes_params(const char *opt_arg,
     return parse_message_sizes_list(opt_arg, params);
 }
 
+static ucs_status_t parse_device_level(const char *opt_arg,
+                                       ucs_device_level_t *device_level)
+{
+    ucs_device_level_t level;
+    for (level = UCS_DEVICE_LEVEL_THREAD; level <= UCS_DEVICE_LEVEL_GRID; ++level) {
+        if (!strcmp(opt_arg, ucs_device_level_name(level))) {
+            *device_level = level;
+            return UCS_OK;
+        }
+    }
+
+    ucs_error("Invalid option argument for device level: %s", opt_arg);
+    return UCS_ERR_INVALID_PARAM;
+}
+
 static ucs_status_t parse_ucp_datatype_params(const char *opt_arg,
                                               ucp_perf_datatype_t *datatype)
 {
@@ -697,6 +717,8 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt,
     case 'a':
         return parse_accel_device_params(opt_arg, &params->super.send_device,
                                          &params->super.recv_device);
+    case 'L':
+        return parse_device_level(opt_arg, &params->super.device_level);
     case 'y':
         params->super.flags |= UCX_PERF_TEST_FLAG_AM_RECV_COPY;
         return UCS_OK;
diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h
index d7aa07f3f5b..d1b5e5dff89 100644
--- a/src/ucs/sys/device_code.h
+++ b/src/ucs/sys/device_code.h
@@ -30,6 +30,24 @@ typedef enum {
 } ucs_device_level_t;
 
 
+static UCS_F_ALWAYS_INLINE const char*
+ucs_device_level_name(ucs_device_level_t level)
+{
+    switch (level) {
+    case UCS_DEVICE_LEVEL_THREAD:
+        return "thread";
+    case UCS_DEVICE_LEVEL_WARP:
+        return "warp";
+    case UCS_DEVICE_LEVEL_BLOCK:
+        return "block";
+    case UCS_DEVICE_LEVEL_GRID:
+        return "grid";
+    default:
+        return "unknown";
+    }
+}
+
+
 /*
  * Read a 64-bit atomic value from a global memory address.
  */

From 4c4a3a6ef173f02f01322185fd14f537558cfd80 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 08:22:41 +0000
Subject: [PATCH 05/32] UCP/PERF: Support for IOV

---
 src/tools/perf/cuda/cuda_kernel.cuh    | 24 +++++++++
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 67 +++++++++++++++++++++-----
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index bc4c79b8370..8c7bfae2d60 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -13,6 +13,8 @@
 #include <ucs/sys/device_code.h>
 #include <cuda_runtime.h>
 
+#include <memory>
+
 
 typedef unsigned long long ucx_perf_cuda_time_t;
 
@@ -88,6 +90,28 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from)
     return bits;
 }
 
+struct ucx_perf_cuda_element {
+    unsigned index;
+    void     *address;
+    uint64_t remote_address;
+    size_t   length;
+};
+
+struct ucx_perf_cuda_element_list {
+    size_t                count;
+    ucx_perf_cuda_element elements[0];
+};
+
+template <typename T>
+using ucx_perf_cuda_unique_ptr = std::unique_ptr<T, void(*)(T*)>;
+
+template <typename T>
+ucx_perf_cuda_unique_ptr<T> ucx_perf_cuda_make_unique(size_t size) {
+    T* raw = nullptr;
+    CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &raw, size);
+    return ucx_perf_cuda_unique_ptr<T>(raw, [](T* p){ if (p) cudaFree(p); });
+}
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index fb31396e60d..e09dd3fff1e 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -13,8 +13,6 @@
 #include <ucp/api/device/ucp_device_impl.h>
 #include <tools/perf/lib/ucp_tests.h>
 
-#include <memory>
-
 
 class ucp_perf_cuda_request_manager {
 public:
@@ -217,6 +215,11 @@ public:
             return UCS_ERR_NO_MEMORY;
         }
 
+        unique_element_list_ptr element_list = create_element_list();
+        if (!element_list) {
+            return UCS_ERR_NO_MEMORY;
+        }
+
         ucp_perf_barrier(&m_perf);
         ucx_perf_test_start_clock(&m_perf);
 
@@ -240,15 +243,25 @@ public:
         size_t length     = ucx_perf_get_message_size(&m_perf.params);
         unsigned my_index = rte_call(&m_perf, group_index);
 
-        ucp_perf_barrier(&m_perf);
-        ucx_perf_test_start_clock(&m_perf);
+        unique_mem_list_ptr handle(nullptr, nullptr);
+        unique_element_list_ptr element_list(nullptr, nullptr);
 
         if (my_index == 1) {
-            unique_mem_list_ptr handle = create_mem_list();
+            handle = create_mem_list();
             if (!handle) {
                 return UCS_ERR_NO_MEMORY;
             }
 
+            element_list = create_element_list();
+            if (!element_list) {
+                return UCS_ERR_NO_MEMORY;
+            }
+        }
+
+        ucp_perf_barrier(&m_perf);
+        ucx_perf_test_start_clock(&m_perf);
+
+        if (my_index == 1) {
             unsigned thread_count = m_perf.params.device_thread_count;
             ucp_perf_cuda_put_multi_bw_kernel
                 <UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
@@ -274,19 +287,22 @@ private:
 
     unique_mem_list_ptr create_mem_list() const
     {
-        ucp_device_mem_list_elem_t elem;
-        elem.field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
-                          UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
-        elem.memh       = m_perf.ucp.send_memh;
-        elem.rkey       = m_perf.ucp.rkey;
+        size_t count = m_perf.params.msg_size_cnt;
+        ucp_device_mem_list_elem_t elems[count];
+        for (size_t i = 0; i < count; ++i) {
+            elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
+                                  UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
+            elems[i].memh       = m_perf.ucp.send_memh;
+            elems[i].rkey       = m_perf.ucp.rkey;
+        }
 
         ucp_device_mem_list_params_t params;
         params.field_mask   = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
                               UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
                               UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS;
         params.element_size = sizeof(ucp_device_mem_list_elem_t);
-        params.num_elements = 1;
-        params.elements     = &elem;
+        params.num_elements = count;
+        params.elements     = elems;
 
         ucp_device_mem_list_handle_h mem_list;
         ucs_status_t status = ucp_device_mem_list_create(m_perf.ucp.ep, &params,
@@ -297,6 +313,33 @@ private:
 
         return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release);
     }
+
+    using unique_element_list_ptr =
+        ucx_perf_cuda_unique_ptr<ucx_perf_cuda_element_list>;
+
+    unique_element_list_ptr create_element_list() const
+    {
+        size_t count                 = m_perf.params.msg_size_cnt;
+        unique_element_list_ptr list =
+            ucx_perf_cuda_make_unique<ucx_perf_cuda_element_list>(
+                sizeof(ucx_perf_cuda_element_list) +
+                (count * sizeof(ucx_perf_cuda_element)));
+
+        size_t offset = 0;
+        for (unsigned i = 0; i < count; ++i) {
+            ucx_perf_cuda_element elem = {
+                .index          = i,
+                .address        = (char *)m_perf.send_buffer + offset,
+                .remote_address = m_perf.ucp.remote_addr + offset,
+                .length         = m_perf.params.msg_size_list[i]
+            };
+            offset += elem.length;
+            CUDA_CALL_ERR(cudaMemcpy, &list->elements[i], &elem, sizeof(elem),
+                          cudaMemcpyHostToDevice);
+        }
+
+        return list;
+    }
 };
 
 ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher;

From 5a59ee2ac6a07ef213df2691c5312a8976803729 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 09:40:32 +0000
Subject: [PATCH 06/32] UCP/PERF: Kernel dispatch macro

---
 src/tools/perf/cuda/cuda_kernel.cuh    | 43 ++++++++++++++++++++++++++
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 25 +++++++--------
 2 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 8c7bfae2d60..cff1f263cdd 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -112,6 +112,49 @@ ucx_perf_cuda_unique_ptr<T> ucx_perf_cuda_make_unique(size_t size) {
     return ucx_perf_cuda_unique_ptr<T>(raw, [](T* p){ if (p) cudaFree(p); });
 }
 
+#define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \
+    do { \
+        switch (cmd) { \
+        case UCX_PERF_CMD_PUT_SINGLE: \
+            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_MULTI: \
+            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads>>>(__VA_ARGS__); \
+            break; \
+        case UCX_PERF_CMD_PUT_PARTIAL: \
+            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads>>>(__VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported cmd: %d", cmd); \
+            break; \
+        } \
+    } while (0)
+
+#define UCX_KERNEL_DISPATCH(perf, func, ...) \
+    do { \
+        ucs_device_level_t _level = perf.params.device_level; \
+        ucx_perf_cmd_t _cmd       = perf.params.command; \
+        unsigned _blocks          = perf.params.device_block_count; \
+        unsigned _threads         = perf.params.device_thread_count; \
+        switch (_level) { \
+        case UCS_DEVICE_LEVEL_THREAD: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_WARP: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_BLOCK: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            break; \
+        case UCS_DEVICE_LEVEL_GRID: \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            break; \
+        default: \
+            ucs_error("Unsupported level: %d", _level); \
+            break; \
+        } \
+    } while (0)
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index e09dd3fff1e..089ebb259b7 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -71,7 +71,7 @@ private:
     uint8_t              m_pending[UCX_BITSET_SIZE(CAPACITY)];
 };
 
-template<ucs_device_level_t level>
+template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
                                   ucp_device_mem_list_handle_h mem_list,
@@ -149,7 +149,7 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
     return status;
 }
 
-template<ucs_device_level_t level>
+template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
                                        ucp_device_mem_list_handle_h mem_list,
@@ -206,9 +206,8 @@ public:
 
     ucs_status_t run_pingpong()
     {
-        size_t length         = ucx_perf_get_message_size(&m_perf.params);
-        unsigned thread_count = m_perf.params.device_thread_count;
-        unsigned my_index     = rte_call(&m_perf, group_index);
+        size_t length     = ucx_perf_get_message_size(&m_perf.params);
+        unsigned my_index = rte_call(&m_perf, group_index);
 
         unique_mem_list_ptr handle = create_mem_list();
         if (!handle) {
@@ -223,10 +222,10 @@ public:
         ucp_perf_barrier(&m_perf);
         ucx_perf_test_start_clock(&m_perf);
 
-        ucp_perf_cuda_put_multi_latency_kernel
-            <UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
-                gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
-                m_perf.ucp.remote_addr, length, m_perf.recv_buffer, my_index);
+        UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
+                            gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
+                            m_perf.ucp.remote_addr, length, m_perf.recv_buffer,
+                            my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
         wait_for_kernel(length);
@@ -262,11 +261,9 @@ public:
         ucx_perf_test_start_clock(&m_perf);
 
         if (my_index == 1) {
-            unsigned thread_count = m_perf.params.device_thread_count;
-            ucp_perf_cuda_put_multi_bw_kernel
-                <UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
-                    gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
-                    m_perf.ucp.remote_addr, length);
+            UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel,
+                                gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
+                                m_perf.ucp.remote_addr, length);
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel(length);
         } else if (my_index == 0) {

From 0fcc18c22fc51845d311f2a59d56ec09f4c66a26 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 09:45:33 +0000
Subject: [PATCH 07/32] UCP/PERF: Fixed coverity warning

---
 test/gtest/common/test_perf.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/gtest/common/test_perf.cc b/test/gtest/common/test_perf.cc
index 01d36c4ae74..93d3171311d 100644
--- a/test/gtest/common/test_perf.cc
+++ b/test/gtest/common/test_perf.cc
@@ -217,6 +217,8 @@ void test_perf::test_params_init(const test_spec &test,
     params.send_device         = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
     params.recv_device         = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
     params.device_thread_count = 1;
+    params.device_block_count  = 1;
+    params.device_level        = UCS_DEVICE_LEVEL_THREAD;
     params.percentile_rank     = 50.0;
 
     memset(params.uct.md_name, 0, sizeof(params.uct.md_name));

From 50fbf7935d1eed97cd240ed1579c4653126eb4d8 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 12:00:05 +0000
Subject: [PATCH 08/32] UCP/PERF: Fixed build error

---
 src/ucs/sys/device_code.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h
index d1b5e5dff89..768a082ca65 100644
--- a/src/ucs/sys/device_code.h
+++ b/src/ucs/sys/device_code.h
@@ -7,6 +7,7 @@
 #ifndef UCS_DEVICE_CODE_H
 #define UCS_DEVICE_CODE_H
 
+#include <ucs/sys/compiler_def.h>
 #include <stdint.h>
 
 /*

From d99de4be48c29fda7f2fb86145c80874034dc31f Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 12:17:26 +0000
Subject: [PATCH 09/32] UCP/PERF: Common send function for all APIs

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 117 ++++++++++++++-----------
 1 file changed, 65 insertions(+), 52 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 089ebb259b7..97e981c90db 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -71,13 +71,55 @@ private:
     uint8_t              m_pending[UCX_BITSET_SIZE(CAPACITY)];
 };
 
+template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
+UCS_F_DEVICE ucs_status_t
+ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list,
+                       ucx_perf_cuda_element_list &element_list,
+                       ucp_device_request_t &req)
+{
+    switch (cmd) {
+    case UCX_PERF_CMD_PUT_SINGLE: {
+        ucx_perf_cuda_element &elem = element_list.elements[0];
+        return ucp_device_put_single<level>(mem_list, elem.index,
+                                            elem.address, elem.remote_address,
+                                            elem.length, 0, &req);
+    }
+    // case UCX_PERF_CMD_PUT_MULTI:
+    //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
+    //                                        element_list->count, 0, &req);
+    // case UCX_PERF_CMD_PUT_PARTIAL:
+    //     return ucp_device_put_partial<level>(mem_list, element_list->elements,
+    //                                         element_list->count, 0, &req);
+    }
+
+    return UCS_ERR_INVALID_PARAM;
+}
+
+template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
+UCS_F_DEVICE ucs_status_t
+ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list,
+                        ucx_perf_cuda_element_list &element_list)
+{
+    ucp_device_request_t req;
+    ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(mem_list,
+                                                             element_list, req);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    do {
+        status = ucp_device_progress_req<level>(&req);
+    } while (status == UCS_INPROGRESS);
+
+    return status;
+}
+
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
                                   ucp_device_mem_list_handle_h mem_list,
-                                  unsigned mem_list_index,
-                                  const void *address, uint64_t remote_address,
-                                  size_t length)
+                                  ucx_perf_cuda_element_list &element_list,
+                                  const void *address, size_t length)
 {
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
@@ -95,8 +137,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
         *sn = idx + 1;
         ucp_device_request_t &req = request_mgr.get_request();
-        status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
-                                              remote_address, length, 0, &req);
+        status = ucp_perf_cuda_send_nbx<level, cmd>(mem_list, element_list, req);
         if (status != UCS_OK) {
             break;
         }
@@ -115,47 +156,12 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
     ctx.status = status;
 }
 
-__global__ void
-ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx,
-                                   const void *address, size_t length)
-{
-    volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length);
-    while (*sn < ctx.max_iters) {
-        __nanosleep(100000); // 100us
-    }
-
-    ctx.status = UCS_OK;
-}
-
-template<ucs_device_level_t level>
-UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
-                         unsigned mem_list_index, const void *address,
-                         uint64_t remote_address, size_t length)
-{
-    ucp_device_request_t req;
-    ucs_status_t status;
-
-    status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
-                                          remote_address, length, 0, &req);
-    if (status != UCS_OK) {
-        return status;
-    }
-
-    do {
-        status = ucp_device_progress_req<level>(&req);
-    } while (status == UCS_INPROGRESS);
-
-    return status;
-}
-
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
                                        ucp_device_mem_list_handle_h mem_list,
-                                       unsigned mem_list_index,
+                                       ucx_perf_cuda_element_list &element_list,
                                        const void *address,
-                                       uint64_t remote_address,
                                        size_t length, const void *recv_address,
                                        bool is_sender)
 {
@@ -168,9 +174,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         if (is_sender) {
             *sn    = idx + 1;
-            status = ucp_perf_cuda_put_single<level>(mem_list, mem_list_index,
-                                                     address, remote_address,
-                                                     length);
+            status = ucp_perf_cuda_send_sync<level, cmd>(mem_list, element_list);
             if (status != UCS_OK) {
                 break;
             }
@@ -178,9 +182,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
         } else {
             ucx_perf_cuda_wait_sn(recv_sn, idx + 1);
             *sn    = idx + 1;
-            status = ucp_perf_cuda_put_single<level>(mem_list, mem_list_index,
-                                                     address, remote_address,
-                                                     length);
+            status = ucp_perf_cuda_send_sync<level, cmd>(mem_list, element_list);
             if (status != UCS_OK) {
                 break;
             }
@@ -193,6 +195,18 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
     ctx.status = status;
 }
 
+__global__ void
+ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx,
+                                   const void *address, size_t length)
+{
+    volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length);
+    while (*sn < ctx.max_iters) {
+        __nanosleep(100000); // 100us
+    }
+
+    ctx.status = UCS_OK;
+}
+
 class ucp_perf_cuda_test_runner : public ucx_perf_cuda_test_runner {
 public:
     ucp_perf_cuda_test_runner(ucx_perf_context_t &perf) :
@@ -223,9 +237,8 @@ public:
         ucx_perf_test_start_clock(&m_perf);
 
         UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
-                            gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
-                            m_perf.ucp.remote_addr, length, m_perf.recv_buffer,
-                            my_index);
+                            gpu_ctx(), handle.get(), *element_list, m_perf.send_buffer,
+                            length, m_perf.recv_buffer, my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
         wait_for_kernel(length);
@@ -262,8 +275,8 @@ public:
 
         if (my_index == 1) {
             UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel,
-                                gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
-                                m_perf.ucp.remote_addr, length);
+                                gpu_ctx(), handle.get(), *element_list,
+                                m_perf.send_buffer, length);
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel(length);
         } else if (my_index == 0) {

From 7e7fd34d439d2b671cc16442004c2b5d26b22c0e Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Wed, 17 Sep 2025 13:18:53 +0000
Subject: [PATCH 10/32] UCP/PERF: Common element list for all APIs

---
 src/tools/perf/cuda/cuda_kernel.cuh    | 38 ++++++++--------
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 60 ++++++++++++++------------
 2 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index cff1f263cdd..4291c1d6ef6 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -90,27 +90,29 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from)
     return bits;
 }
 
-struct ucx_perf_cuda_element {
-    unsigned index;
-    void     *address;
-    uint64_t remote_address;
-    size_t   length;
-};
-
 struct ucx_perf_cuda_element_list {
-    size_t                count;
-    ucx_perf_cuda_element elements[0];
-};
+    size_t   m_count;
+    unsigned *m_indices;
+    void     **m_addresses;
+    uint64_t *m_remote_addresses;
+    size_t   *m_lengths;
 
-template <typename T>
-using ucx_perf_cuda_unique_ptr = std::unique_ptr<T, void(*)(T*)>;
+    ucx_perf_cuda_element_list(size_t count) : m_count(count)
+    {
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_indices, count * sizeof(unsigned));
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_addresses, count * sizeof(void*));
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_remote_addresses, count * sizeof(uint64_t));
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_lengths, count * sizeof(size_t));
+    }
 
-template <typename T>
-ucx_perf_cuda_unique_ptr<T> ucx_perf_cuda_make_unique(size_t size) {
-    T* raw = nullptr;
-    CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &raw, size);
-    return ucx_perf_cuda_unique_ptr<T>(raw, [](T* p){ if (p) cudaFree(p); });
-}
+    ~ucx_perf_cuda_element_list()
+    {
+        CUDA_CALL_WARN(cudaFree, m_indices);
+        CUDA_CALL_WARN(cudaFree, m_addresses);
+        CUDA_CALL_WARN(cudaFree, m_remote_addresses);
+        CUDA_CALL_WARN(cudaFree, m_lengths);
+    }
+};
 
 #define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \
     do { \
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 97e981c90db..c803ae6aba4 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -13,6 +13,8 @@
 #include <ucp/api/device/ucp_device_impl.h>
 #include <tools/perf/lib/ucp_tests.h>
 
+#include <vector>
+
 
 class ucp_perf_cuda_request_manager {
 public:
@@ -78,12 +80,11 @@ ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list,
                        ucp_device_request_t &req)
 {
     switch (cmd) {
-    case UCX_PERF_CMD_PUT_SINGLE: {
-        ucx_perf_cuda_element &elem = element_list.elements[0];
-        return ucp_device_put_single<level>(mem_list, elem.index,
-                                            elem.address, elem.remote_address,
-                                            elem.length, 0, &req);
-    }
+    case UCX_PERF_CMD_PUT_SINGLE:
+        return ucp_device_put_single<level>(mem_list, element_list.m_indices[0],
+                                            element_list.m_addresses[0],
+                                            element_list.m_remote_addresses[0],
+                                            element_list.m_lengths[0], 0, &req);
     // case UCX_PERF_CMD_PUT_MULTI:
     //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
     //                                        element_list->count, 0, &req);
@@ -256,7 +257,7 @@ public:
         unsigned my_index = rte_call(&m_perf, group_index);
 
         unique_mem_list_ptr handle(nullptr, nullptr);
-        unique_element_list_ptr element_list(nullptr, nullptr);
+        unique_element_list_ptr element_list;
 
         if (my_index == 1) {
             handle = create_mem_list();
@@ -324,31 +325,36 @@ private:
         return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release);
     }
 
-    using unique_element_list_ptr =
-        ucx_perf_cuda_unique_ptr<ucx_perf_cuda_element_list>;
+    using unique_element_list_ptr = std::unique_ptr<ucx_perf_cuda_element_list>;
 
     unique_element_list_ptr create_element_list() const
     {
-        size_t count                 = m_perf.params.msg_size_cnt;
-        unique_element_list_ptr list =
-            ucx_perf_cuda_make_unique<ucx_perf_cuda_element_list>(
-                sizeof(ucx_perf_cuda_element_list) +
-                (count * sizeof(ucx_perf_cuda_element)));
-
-        size_t offset = 0;
-        for (unsigned i = 0; i < count; ++i) {
-            ucx_perf_cuda_element elem = {
-                .index          = i,
-                .address        = (char *)m_perf.send_buffer + offset,
-                .remote_address = m_perf.ucp.remote_addr + offset,
-                .length         = m_perf.params.msg_size_list[i]
-            };
-            offset += elem.length;
-            CUDA_CALL_ERR(cudaMemcpy, &list->elements[i], &elem, sizeof(elem),
-                          cudaMemcpyHostToDevice);
+        size_t count = m_perf.params.msg_size_cnt;
+        unique_element_list_ptr element_list =
+            unique_element_list_ptr(new ucx_perf_cuda_element_list(count));
+
+        std::vector<unsigned> indices(count);
+        std::vector<void*> addresses(count);
+        std::vector<uint64_t> remote_addresses(count);
+        std::vector<size_t> lengths(count);
+        for (unsigned i = 0, offset = 0; i < count; ++i) {
+            indices[i]          = i;
+            addresses[i]        = (char *)m_perf.send_buffer + offset;
+            remote_addresses[i] = m_perf.ucp.remote_addr + offset;
+            lengths[i]          = m_perf.params.msg_size_list[i];
+            offset             += lengths[i];
         }
 
-        return list;
+        CUDA_CALL_ERR(cudaMemcpy, element_list->m_indices, indices.data(),
+                      count * sizeof(unsigned), cudaMemcpyHostToDevice);
+        CUDA_CALL_ERR(cudaMemcpy, element_list->m_addresses, addresses.data(),
+                      count * sizeof(void*), cudaMemcpyHostToDevice);
+        CUDA_CALL_ERR(cudaMemcpy, element_list->m_remote_addresses,
+                      remote_addresses.data(), count * sizeof(uint64_t),
+                      cudaMemcpyHostToDevice);
+        CUDA_CALL_ERR(cudaMemcpy, element_list->m_lengths, lengths.data(),
+                      count * sizeof(size_t), cudaMemcpyHostToDevice);
+        return element_list;
     }
 };
 

From 0733535fa373a52482577b18a8c107a11b1df61b Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 07:47:58 +0000
Subject: [PATCH 11/32] UCP/PERF: Common params for all APIs

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  24 ---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 216 ++++++++++++-------------
 2 files changed, 106 insertions(+), 134 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 4291c1d6ef6..f24a4ac568b 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -90,30 +90,6 @@ UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from)
     return bits;
 }
 
-struct ucx_perf_cuda_element_list {
-    size_t   m_count;
-    unsigned *m_indices;
-    void     **m_addresses;
-    uint64_t *m_remote_addresses;
-    size_t   *m_lengths;
-
-    ucx_perf_cuda_element_list(size_t count) : m_count(count)
-    {
-        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_indices, count * sizeof(unsigned));
-        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_addresses, count * sizeof(void*));
-        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_remote_addresses, count * sizeof(uint64_t));
-        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, &m_lengths, count * sizeof(size_t));
-    }
-
-    ~ucx_perf_cuda_element_list()
-    {
-        CUDA_CALL_WARN(cudaFree, m_indices);
-        CUDA_CALL_WARN(cudaFree, m_addresses);
-        CUDA_CALL_WARN(cudaFree, m_remote_addresses);
-        CUDA_CALL_WARN(cudaFree, m_lengths);
-    }
-};
-
 #define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \
     do { \
         switch (cmd) { \
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index c803ae6aba4..0476262f892 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -73,18 +73,106 @@ private:
     uint8_t              m_pending[UCX_BITSET_SIZE(CAPACITY)];
 };
 
+struct ucp_perf_cuda_params {
+    ucp_device_mem_list_handle_h mem_list;
+    unsigned                     *indices;
+    void                         **addresses;
+    uint64_t                     *remote_addresses;
+    size_t                       *lengths;
+    ucp_device_flags_t           flags;
+};
+
+class ucp_perf_cuda_params_handler {
+public:
+    ucp_perf_cuda_params_handler(const ucx_perf_context_t &perf)
+    {
+        init_mem_list(perf);
+        init_elements(perf);
+        m_params.flags = UCP_DEVICE_FLAG_NODELAY;
+    }
+
+    ~ucp_perf_cuda_params_handler()
+    {
+        ucp_device_mem_list_release(m_params.mem_list);
+        CUDA_CALL_WARN(cudaFree, m_params.indices);
+        CUDA_CALL_WARN(cudaFree, m_params.addresses);
+        CUDA_CALL_WARN(cudaFree, m_params.remote_addresses);
+        CUDA_CALL_WARN(cudaFree, m_params.lengths);
+    }
+
+    const ucp_perf_cuda_params &get_params() const { return m_params; }
+
+private:
+    void init_mem_list(const ucx_perf_context_t &perf)
+    {
+        size_t count = perf.params.msg_size_cnt;
+        ucp_device_mem_list_elem_t elems[count];
+        for (size_t i = 0; i < count; ++i) {
+            elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
+                                  UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
+            elems[i].memh       = perf.ucp.send_memh;
+            elems[i].rkey       = perf.ucp.rkey;
+        }
+
+        ucp_device_mem_list_params_t params;
+        params.field_mask   = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
+                              UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
+                              UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS;
+        params.element_size = sizeof(ucp_device_mem_list_elem_t);
+        params.num_elements = count;
+        params.elements     = elems;
+
+        ucs_status_t status = ucp_device_mem_list_create(perf.ucp.ep, &params,
+                                                         &m_params.mem_list);
+        if (status != UCS_OK) {
+            throw std::runtime_error("Failed to create memory list");
+        }
+    }
+
+    void init_elements(const ucx_perf_context_t &perf)
+    {
+        size_t count = perf.params.msg_size_cnt;
+
+        std::vector<unsigned> indices(count);
+        std::vector<void*> addresses(count);
+        std::vector<uint64_t> remote_addresses(count);
+        std::vector<size_t> lengths(count);
+        for (unsigned i = 0, offset = 0; i < count; ++i) {
+            indices[i]          = i;
+            addresses[i]        = (char *)perf.send_buffer + offset;
+            remote_addresses[i] = perf.ucp.remote_addr + offset;
+            lengths[i]          = perf.params.msg_size_list[i];
+            offset             += lengths[i];
+        }
+
+        device_clone(&m_params.indices, indices.data(), count);
+        device_clone(&m_params.addresses, addresses.data(), count);
+        device_clone(&m_params.remote_addresses, remote_addresses.data(), count);
+        device_clone(&m_params.lengths, lengths.data(), count);
+    }
+
+    template<typename T>
+    void device_clone(T **dst, const T *src, size_t count)
+    {
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaMalloc, dst, count * sizeof(T));
+        CUDA_CALL_ERR(cudaMemcpy, *dst, src, count * sizeof(T),
+                      cudaMemcpyHostToDevice);
+    }
+
+    ucp_perf_cuda_params m_params;
+};
+
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list,
-                       ucx_perf_cuda_element_list &element_list,
-                       ucp_device_request_t &req)
+ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucp_device_request_t &req)
 {
     switch (cmd) {
     case UCX_PERF_CMD_PUT_SINGLE:
-        return ucp_device_put_single<level>(mem_list, element_list.m_indices[0],
-                                            element_list.m_addresses[0],
-                                            element_list.m_remote_addresses[0],
-                                            element_list.m_lengths[0], 0, &req);
+        return ucp_device_put_single<level>(params.mem_list, params.indices[0],
+                                            params.addresses[0],
+                                            params.remote_addresses[0],
+                                            params.lengths[0], params.flags,
+                                            &req);
     // case UCX_PERF_CMD_PUT_MULTI:
     //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
     //                                        element_list->count, 0, &req);
@@ -98,12 +186,10 @@ ucp_perf_cuda_send_nbx(ucp_device_mem_list_handle_h mem_list,
 
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list,
-                        ucx_perf_cuda_element_list &element_list)
+ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params)
 {
     ucp_device_request_t req;
-    ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(mem_list,
-                                                             element_list, req);
+    ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(params, req);
     if (status != UCS_OK) {
         return status;
     }
@@ -118,8 +204,7 @@ ucp_perf_cuda_send_sync(ucp_device_mem_list_handle_h mem_list,
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
-                                  ucp_device_mem_list_handle_h mem_list,
-                                  ucx_perf_cuda_element_list &element_list,
+                                  ucp_perf_cuda_params params,
                                   const void *address, size_t length)
 {
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
@@ -138,7 +223,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
         *sn = idx + 1;
         ucp_device_request_t &req = request_mgr.get_request();
-        status = ucp_perf_cuda_send_nbx<level, cmd>(mem_list, element_list, req);
+        status = ucp_perf_cuda_send_nbx<level, cmd>(params, req);
         if (status != UCS_OK) {
             break;
         }
@@ -160,8 +245,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
-                                       ucp_device_mem_list_handle_h mem_list,
-                                       ucx_perf_cuda_element_list &element_list,
+                                       ucp_perf_cuda_params params,
                                        const void *address,
                                        size_t length, const void *recv_address,
                                        bool is_sender)
@@ -175,7 +259,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         if (is_sender) {
             *sn    = idx + 1;
-            status = ucp_perf_cuda_send_sync<level, cmd>(mem_list, element_list);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params);
             if (status != UCS_OK) {
                 break;
             }
@@ -183,7 +267,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
         } else {
             ucx_perf_cuda_wait_sn(recv_sn, idx + 1);
             *sn    = idx + 1;
-            status = ucp_perf_cuda_send_sync<level, cmd>(mem_list, element_list);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params);
             if (status != UCS_OK) {
                 break;
             }
@@ -223,22 +307,13 @@ public:
     {
         size_t length     = ucx_perf_get_message_size(&m_perf.params);
         unsigned my_index = rte_call(&m_perf, group_index);
-
-        unique_mem_list_ptr handle = create_mem_list();
-        if (!handle) {
-            return UCS_ERR_NO_MEMORY;
-        }
-
-        unique_element_list_ptr element_list = create_element_list();
-        if (!element_list) {
-            return UCS_ERR_NO_MEMORY;
-        }
+        ucp_perf_cuda_params_handler params_handler(m_perf);
 
         ucp_perf_barrier(&m_perf);
         ucx_perf_test_start_clock(&m_perf);
 
         UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
-                            gpu_ctx(), handle.get(), *element_list, m_perf.send_buffer,
+                            gpu_ctx(), params_handler.get_params(), m_perf.send_buffer,
                             length, m_perf.recv_buffer, my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
@@ -256,27 +331,14 @@ public:
         size_t length     = ucx_perf_get_message_size(&m_perf.params);
         unsigned my_index = rte_call(&m_perf, group_index);
 
-        unique_mem_list_ptr handle(nullptr, nullptr);
-        unique_element_list_ptr element_list;
-
-        if (my_index == 1) {
-            handle = create_mem_list();
-            if (!handle) {
-                return UCS_ERR_NO_MEMORY;
-            }
-
-            element_list = create_element_list();
-            if (!element_list) {
-                return UCS_ERR_NO_MEMORY;
-            }
-        }
+        ucp_perf_cuda_params_handler params_handler(m_perf);
 
         ucp_perf_barrier(&m_perf);
         ucx_perf_test_start_clock(&m_perf);
 
         if (my_index == 1) {
             UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel,
-                                gpu_ctx(), handle.get(), *element_list,
+                                gpu_ctx(), params_handler.get_params(),
                                 m_perf.send_buffer, length);
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel(length);
@@ -290,72 +352,6 @@ public:
         ucp_perf_barrier(&m_perf);
         return UCS_OK;
     }
-
-private:
-    using unique_mem_list_ptr =
-        std::unique_ptr<struct ucp_device_mem_list_handle,
-                        decltype(&ucp_device_mem_list_release)>;
-
-    unique_mem_list_ptr create_mem_list() const
-    {
-        size_t count = m_perf.params.msg_size_cnt;
-        ucp_device_mem_list_elem_t elems[count];
-        for (size_t i = 0; i < count; ++i) {
-            elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
-                                  UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
-            elems[i].memh       = m_perf.ucp.send_memh;
-            elems[i].rkey       = m_perf.ucp.rkey;
-        }
-
-        ucp_device_mem_list_params_t params;
-        params.field_mask   = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
-                              UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
-                              UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS;
-        params.element_size = sizeof(ucp_device_mem_list_elem_t);
-        params.num_elements = count;
-        params.elements     = elems;
-
-        ucp_device_mem_list_handle_h mem_list;
-        ucs_status_t status = ucp_device_mem_list_create(m_perf.ucp.ep, &params,
-                                                         &mem_list);
-        if (status != UCS_OK) {
-            return unique_mem_list_ptr(nullptr, nullptr);
-        }
-
-        return unique_mem_list_ptr(mem_list, ucp_device_mem_list_release);
-    }
-
-    using unique_element_list_ptr = std::unique_ptr<ucx_perf_cuda_element_list>;
-
-    unique_element_list_ptr create_element_list() const
-    {
-        size_t count = m_perf.params.msg_size_cnt;
-        unique_element_list_ptr element_list =
-            unique_element_list_ptr(new ucx_perf_cuda_element_list(count));
-
-        std::vector<unsigned> indices(count);
-        std::vector<void*> addresses(count);
-        std::vector<uint64_t> remote_addresses(count);
-        std::vector<size_t> lengths(count);
-        for (unsigned i = 0, offset = 0; i < count; ++i) {
-            indices[i]          = i;
-            addresses[i]        = (char *)m_perf.send_buffer + offset;
-            remote_addresses[i] = m_perf.ucp.remote_addr + offset;
-            lengths[i]          = m_perf.params.msg_size_list[i];
-            offset             += lengths[i];
-        }
-
-        CUDA_CALL_ERR(cudaMemcpy, element_list->m_indices, indices.data(),
-                      count * sizeof(unsigned), cudaMemcpyHostToDevice);
-        CUDA_CALL_ERR(cudaMemcpy, element_list->m_addresses, addresses.data(),
-                      count * sizeof(void*), cudaMemcpyHostToDevice);
-        CUDA_CALL_ERR(cudaMemcpy, element_list->m_remote_addresses,
-                      remote_addresses.data(), count * sizeof(uint64_t),
-                      cudaMemcpyHostToDevice);
-        CUDA_CALL_ERR(cudaMemcpy, element_list->m_lengths, lengths.data(),
-                      count * sizeof(size_t), cudaMemcpyHostToDevice);
-        return element_list;
-    }
 };
 
 ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher;

From 74f4b692484476f6e9085e06f10c2e50ad1cf3bb Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 08:02:09 +0000
Subject: [PATCH 12/32] UCP/PERF: Report kernel status

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  7 ++-----
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 23 +++++++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index f24a4ac568b..07b4412814d 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -153,8 +153,6 @@ public:
         CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
-    ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
-
     void wait_for_kernel(size_t msg_length)
     {
         ucx_perf_counter_t last_completed = 0;
@@ -176,6 +174,8 @@ public:
 
 protected:
     ucx_perf_context_t &m_perf;
+    ucx_perf_cuda_context *m_cpu_ctx;
+    ucx_perf_cuda_context *m_gpu_ctx;
 
 private:
     void init_ctx()
@@ -185,9 +185,6 @@ private:
         CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
                   &m_gpu_ctx, m_cpu_ctx, 0);
     }
-
-    ucx_perf_cuda_context *m_cpu_ctx;
-    ucx_perf_cuda_context *m_gpu_ctx;
 };
 
 
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 0476262f892..72017b47757 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -204,12 +204,10 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params)
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
-                                  ucp_perf_cuda_params params,
-                                  const void *address, size_t length)
+                                  ucp_perf_cuda_params params)
 {
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
-    uint64_t *sn                          = ucx_perf_cuda_get_sn(address, length);
     ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding);
     ucs_status_t status;
 
@@ -221,7 +219,13 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
             }
         }
 
-        *sn = idx + 1;
+        /* TODO: Change to ucp_device_counter_write */
+        if ((cmd == UCX_PERF_CMD_PUT_SINGLE) && (idx == max_iters - 1)) {
+            uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0],
+                                                params.lengths[0]);
+            *sn = idx + 1;
+        }
+
         ucp_device_request_t &req = request_mgr.get_request();
         status = ucp_perf_cuda_send_nbx<level, cmd>(params, req);
         if (status != UCS_OK) {
@@ -313,7 +317,7 @@ public:
         ucx_perf_test_start_clock(&m_perf);
 
         UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
-                            gpu_ctx(), params_handler.get_params(), m_perf.send_buffer,
+                            *m_gpu_ctx, params_handler.get_params(), m_perf.send_buffer,
                             length, m_perf.recv_buffer, my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
@@ -323,7 +327,7 @@ public:
 
         ucx_perf_get_time(&m_perf);
         ucp_perf_barrier(&m_perf);
-        return UCS_OK;
+        return m_cpu_ctx->status;
     }
 
     ucs_status_t run_stream_uni()
@@ -338,19 +342,18 @@ public:
 
         if (my_index == 1) {
             UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel,
-                                gpu_ctx(), params_handler.get_params(),
-                                m_perf.send_buffer, length);
+                                *m_gpu_ctx, params_handler.get_params());
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel(length);
         } else if (my_index == 0) {
             ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>(
-                    gpu_ctx(), m_perf.recv_buffer, length);
+                    *m_gpu_ctx, m_perf.recv_buffer, length);
         }
 
         CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize);
         ucx_perf_get_time(&m_perf);
         ucp_perf_barrier(&m_perf);
-        return UCS_OK;
+        return m_cpu_ctx->status;
     }
 };
 

From 4015d74d203d3e7e1ee7fbc90a07baed35df8488 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 08:22:16 +0000
Subject: [PATCH 13/32] UCP/PERF: Reduce number of kernel args

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  3 +-
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 41 ++++++++++++--------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 07b4412814d..6c6e192d3ab 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -153,8 +153,9 @@ public:
         CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
-    void wait_for_kernel(size_t msg_length)
+    void wait_for_kernel()
     {
+        size_t msg_length                 = ucx_perf_get_message_size(&m_perf.params);
         ucx_perf_counter_t last_completed = 0;
         ucx_perf_counter_t completed      = m_cpu_ctx->completed_iters;
         while (1) {
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 72017b47757..e64ee5fdc5f 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -164,15 +164,20 @@ private:
 
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucp_device_request_t &req)
+ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
+                       ucp_device_request_t &req)
 {
     switch (cmd) {
-    case UCX_PERF_CMD_PUT_SINGLE:
+    case UCX_PERF_CMD_PUT_SINGLE: {
+        /* TODO: Change to ucp_device_counter_write */
+        uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], params.lengths[0]);
+        *sn = idx + 1;
         return ucp_device_put_single<level>(params.mem_list, params.indices[0],
                                             params.addresses[0],
                                             params.remote_addresses[0],
                                             params.lengths[0], params.flags,
                                             &req);
+    }
     // case UCX_PERF_CMD_PUT_MULTI:
     //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
     //                                        element_list->count, 0, &req);
@@ -186,10 +191,10 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucp_device_request_t &req)
 
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params)
+ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx)
 {
     ucp_device_request_t req;
-    ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(params, req);
+    ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
     if (status != UCS_OK) {
         return status;
     }
@@ -219,15 +224,8 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
             }
         }
 
-        /* TODO: Change to ucp_device_counter_write */
-        if ((cmd == UCX_PERF_CMD_PUT_SINGLE) && (idx == max_iters - 1)) {
-            uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0],
-                                                params.lengths[0]);
-            *sn = idx + 1;
-        }
-
         ucp_device_request_t &req = request_mgr.get_request();
-        status = ucp_perf_cuda_send_nbx<level, cmd>(params, req);
+        status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
         if (status != UCS_OK) {
             break;
         }
@@ -250,20 +248,19 @@ template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
                                        ucp_perf_cuda_params params,
-                                       const void *address,
-                                       size_t length, const void *recv_address,
+                                       const void *recv_address, size_t length,
                                        bool is_sender)
 {
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
-    uint64_t *sn                          = ucx_perf_cuda_get_sn(address, length);
+    uint64_t *sn                          = ucx_perf_cuda_get_sn(params.addresses[0], length);
     uint64_t *recv_sn                     = ucx_perf_cuda_get_sn(recv_address, length);
     ucs_status_t status                   = UCS_OK;
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         if (is_sender) {
             *sn    = idx + 1;
-            status = ucp_perf_cuda_send_sync<level, cmd>(params);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
             if (status != UCS_OK) {
                 break;
             }
@@ -271,7 +268,7 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
         } else {
             ucx_perf_cuda_wait_sn(recv_sn, idx + 1);
             *sn    = idx + 1;
-            status = ucp_perf_cuda_send_sync<level, cmd>(params);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
             if (status != UCS_OK) {
                 break;
             }
@@ -317,11 +314,11 @@ public:
         ucx_perf_test_start_clock(&m_perf);
 
         UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
-                            *m_gpu_ctx, params_handler.get_params(), m_perf.send_buffer,
-                            length, m_perf.recv_buffer, my_index);
+                            *m_gpu_ctx, params_handler.get_params(),
+                            m_perf.recv_buffer, length, my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
-        wait_for_kernel(length);
+        wait_for_kernel();
 
         CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize);
 
@@ -332,7 +329,6 @@ public:
 
     ucs_status_t run_stream_uni()
     {
-        size_t length     = ucx_perf_get_message_size(&m_perf.params);
         unsigned my_index = rte_call(&m_perf, group_index);
 
         ucp_perf_cuda_params_handler params_handler(m_perf);
@@ -344,8 +340,9 @@ public:
             UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_bw_kernel,
                                 *m_gpu_ctx, params_handler.get_params());
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
-            wait_for_kernel(length);
+            wait_for_kernel();
         } else if (my_index == 0) {
+            size_t length = ucx_perf_get_message_size(&m_perf.params);
             ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>(
                     *m_gpu_ctx, m_perf.recv_buffer, length);
         }

From 39188ba52fce384ddec31474d24f8f81c12e39db Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 11:35:53 +0000
Subject: [PATCH 14/32] UCP/PERF: Progress fix from Thomas

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  4 ++--
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 6c6e192d3ab..dca1942d7ae 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -81,9 +81,9 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
     return count;
 }
 
-UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
+UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from) {
     for (size_t i = from; i < bits; i++) {
-        if (UCX_BIT_GET(set, i)) {
+        if (!UCX_BIT_GET(set, i)) {
             return i;
         }
     }
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index e64ee5fdc5f..8d690f9e0f0 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -54,7 +54,7 @@ public:
     __device__ ucp_device_request_t &get_request()
     {
         assert(get_pending_count() < m_size);
-        size_t index = ucx_bitset_ffs(m_pending, m_size, 0);
+        size_t index = ucx_bitset_ffns(m_pending, m_size, 0);
         UCX_BIT_SET(m_pending, index);
         return m_requests[index];
     }
@@ -220,15 +220,17 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
             status = request_mgr.progress<level>(1);
             if (status != UCS_OK) {
-                break;
+                continue;
             }
         }
 
         ucp_device_request_t &req = request_mgr.get_request();
-        status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
-        if (status != UCS_OK) {
-            break;
-        }
+        do {
+            status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
+            if (status == UCS_ERR_NO_RESOURCE) {
+                request_mgr.progress<level>(1);
+            }
+        } while (status == UCS_ERR_NO_RESOURCE);
 
         ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
         __syncthreads();

From 433c48d2c434d63f27277be9fc8ec7ec5c7208e3 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 11:37:45 +0000
Subject: [PATCH 15/32] UCP/PERF: TODO comment

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 8d690f9e0f0..b32a601d7ae 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -225,6 +225,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         }
 
         ucp_device_request_t &req = request_mgr.get_request();
+        // TODO: remove loop once API is changed
         do {
             status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
             if (status == UCS_ERR_NO_RESOURCE) {

From dc25dd4b9ac78f2716d0d43396d019a20389336c Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 12:10:01 +0000
Subject: [PATCH 16/32] UCP/PERF: Counters in params

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  7 ++--
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 45 ++++++++++++++------------
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index dca1942d7ae..5a31d3dac41 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -51,15 +51,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
     }
 }
 
-UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
+static UCS_F_ALWAYS_INLINE uint64_t *
+ucx_perf_cuda_get_sn(const void *address, size_t length)
 {
     return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
 }
 
-UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
+UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
 {
     if (threadIdx.x == 0) {
-        while (*sn < value);
+        while (ucs_device_atomic64_read(sn) < value);
     }
     __syncthreads();
 }
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index b32a601d7ae..a0c009c9b6e 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -79,6 +79,9 @@ struct ucp_perf_cuda_params {
     void                         **addresses;
     uint64_t                     *remote_addresses;
     size_t                       *lengths;
+    uint64_t                     counter_remote;
+    uint64_t                     *counter_send;
+    uint64_t                     *counter_recv;
     ucp_device_flags_t           flags;
 };
 
@@ -88,7 +91,7 @@ public:
     {
         init_mem_list(perf);
         init_elements(perf);
-        m_params.flags = UCP_DEVICE_FLAG_NODELAY;
+        init_counters(perf);
     }
 
     ~ucp_perf_cuda_params_handler()
@@ -151,6 +154,16 @@ private:
         device_clone(&m_params.lengths, lengths.data(), count);
     }
 
+    void init_counters(const ucx_perf_context_t &perf)
+    {
+        size_t length           = ucx_perf_get_message_size(&perf.params);
+        m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn(
+                                        (void*)perf.ucp.remote_addr, length);
+        m_params.counter_send   = ucx_perf_cuda_get_sn(perf.send_buffer, length);
+        m_params.counter_recv   = ucx_perf_cuda_get_sn(perf.recv_buffer, length);
+        m_params.flags          = UCP_DEVICE_FLAG_NODELAY;
+    }
+
     template<typename T>
     void device_clone(T **dst, const T *src, size_t count)
     {
@@ -168,16 +181,14 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
                        ucp_device_request_t &req)
 {
     switch (cmd) {
-    case UCX_PERF_CMD_PUT_SINGLE: {
+    case UCX_PERF_CMD_PUT_SINGLE:
         /* TODO: Change to ucp_device_counter_write */
-        uint64_t *sn = ucx_perf_cuda_get_sn(params.addresses[0], params.lengths[0]);
-        *sn = idx + 1;
+        *params.counter_send = idx + 1;
         return ucp_device_put_single<level>(params.mem_list, params.indices[0],
                                             params.addresses[0],
                                             params.remote_addresses[0],
                                             params.lengths[0], params.flags,
                                             &req);
-    }
     // case UCX_PERF_CMD_PUT_MULTI:
     //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
     //                                        element_list->count, 0, &req);
@@ -239,7 +250,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
     while (request_mgr.get_pending_count() > 0) {
         status = request_mgr.progress<level>(max_iters);
-        if (status != UCS_OK) {
+        if (UCS_STATUS_IS_ERR(status)) {
             break;
         }
     }
@@ -251,26 +262,21 @@ template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 __global__ void
 ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
                                        ucp_perf_cuda_params params,
-                                       const void *recv_address, size_t length,
                                        bool is_sender)
 {
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
-    uint64_t *sn                          = ucx_perf_cuda_get_sn(params.addresses[0], length);
-    uint64_t *recv_sn                     = ucx_perf_cuda_get_sn(recv_address, length);
     ucs_status_t status                   = UCS_OK;
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         if (is_sender) {
-            *sn    = idx + 1;
             status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
             if (status != UCS_OK) {
                 break;
             }
-            ucx_perf_cuda_wait_sn(recv_sn, idx + 1);
+            ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
         } else {
-            ucx_perf_cuda_wait_sn(recv_sn, idx + 1);
-            *sn    = idx + 1;
+            ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
             status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
             if (status != UCS_OK) {
                 break;
@@ -286,9 +292,10 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
 
 __global__ void
 ucp_perf_cuda_wait_multi_bw_kernel(ucx_perf_cuda_context &ctx,
-                                   const void *address, size_t length)
+                                   ucp_perf_cuda_params params)
 {
-    volatile uint64_t *sn = ucx_perf_cuda_get_sn(address, length);
+    // TODO: we can use ucp_device_counter_read, but it adds latency
+    volatile uint64_t *sn = params.counter_recv;
     while (*sn < ctx.max_iters) {
         __nanosleep(100000); // 100us
     }
@@ -309,7 +316,6 @@ public:
 
     ucs_status_t run_pingpong()
     {
-        size_t length     = ucx_perf_get_message_size(&m_perf.params);
         unsigned my_index = rte_call(&m_perf, group_index);
         ucp_perf_cuda_params_handler params_handler(m_perf);
 
@@ -317,8 +323,7 @@ public:
         ucx_perf_test_start_clock(&m_perf);
 
         UCX_KERNEL_DISPATCH(m_perf, ucp_perf_cuda_put_multi_latency_kernel,
-                            *m_gpu_ctx, params_handler.get_params(),
-                            m_perf.recv_buffer, length, my_index);
+                            *m_gpu_ctx, params_handler.get_params(), my_index);
         CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
 
         wait_for_kernel();
@@ -333,7 +338,6 @@ public:
     ucs_status_t run_stream_uni()
     {
         unsigned my_index = rte_call(&m_perf, group_index);
-
         ucp_perf_cuda_params_handler params_handler(m_perf);
 
         ucp_perf_barrier(&m_perf);
@@ -345,9 +349,8 @@ public:
             CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
             wait_for_kernel();
         } else if (my_index == 0) {
-            size_t length = ucx_perf_get_message_size(&m_perf.params);
             ucp_perf_cuda_wait_multi_bw_kernel<<<1, 1>>>(
-                    *m_gpu_ctx, m_perf.recv_buffer, length);
+                    *m_gpu_ctx, params_handler.get_params());
         }
 
         CUDA_CALL_RET(UCS_ERR_IO_ERROR, cudaDeviceSynchronize);

From 881d9ce95af440906b85917debc8314c943eb7e4 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 12:15:08 +0000
Subject: [PATCH 17/32] UCP/PERF: put multi

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index a0c009c9b6e..1fd8645e006 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -189,9 +189,12 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
                                             params.remote_addresses[0],
                                             params.lengths[0], params.flags,
                                             &req);
-    // case UCX_PERF_CMD_PUT_MULTI:
-    //     return ucp_device_put_multi<level>(mem_list, element_list->elements,
-    //                                        element_list->count, 0, &req);
+    case UCX_PERF_CMD_PUT_MULTI:
+        return ucp_device_put_multi<level>(params.mem_list, params.addresses,
+                                           params.remote_addresses,
+                                           params.lengths, 1,
+                                           params.counter_remote, params.flags,
+                                           &req);
     // case UCX_PERF_CMD_PUT_PARTIAL:
     //     return ucp_device_put_partial<level>(mem_list, element_list->elements,
     //                                         element_list->count, 0, &req);

From 6276578dad5c1727958f444bea4ab90f12265295 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 12:28:07 +0000
Subject: [PATCH 18/32] UCP/PERF: put partial

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 13 ++++++++++---
 src/ucp/api/device/ucp_device_impl.h   |  1 +
 src/uct/api/device/uct_device_impl.h   |  1 -
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 1fd8645e006..83e17283833 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -195,9 +195,16 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
                                            params.lengths, 1,
                                            params.counter_remote, params.flags,
                                            &req);
-    // case UCX_PERF_CMD_PUT_PARTIAL:
-    //     return ucp_device_put_partial<level>(mem_list, element_list->elements,
-    //                                         element_list->count, 0, &req);
+    case UCX_PERF_CMD_PUT_PARTIAL:
+        return ucp_device_put_multi_partial<level>(params.mem_list,
+                                                   params.indices,
+                                                   params.mem_list->mem_list_length,
+                                                   params.addresses,
+                                                   params.remote_addresses,
+                                                   params.lengths,
+                                                   params.mem_list->mem_list_length,
+                                                   1, params.counter_remote,
+                                                   params.flags, &req);
     }
 
     return UCS_ERR_INVALID_PARAM;
diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h
index cad60dcd186..f82725f7576 100644
--- a/src/ucp/api/device/ucp_device_impl.h
+++ b/src/ucp/api/device/ucp_device_impl.h
@@ -307,6 +307,7 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
  * @param [in]  addresses              Array of local addresses to send from.
  * @param [in]  remote_addresses       Array of remote addresses to send to.
  * @param [in]  lengths                Array of lengths in bytes for each send.
+ * @param [in]  counter_index          Index of remote increment descriptor.
  * @param [in]  counter_inc_value      Value of the remote increment.
  * @param [in]  counter_remote_address Remote address to increment to.
  * @param [in]  flags                  Flags to modify the function behavior.
diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h
index 648cab75c36..0271e923dbd 100644
--- a/src/uct/api/device/uct_device_impl.h
+++ b/src/uct/api/device/uct_device_impl.h
@@ -124,7 +124,6 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add(
  * @param [in]  addresses              Array of local addresses to send from.
  * @param [in]  remote_addresses       Array of remote addresses to send to.
  * @param [in]  lengths                Array of lengths in bytes for each send.
- * @param [in]  counter_index          Index of remote increment descriptor.
  * @param [in]  counter_inc_value      Value of the remote increment.
  * @param [in]  counter_remote_address Remote address to increment to.
  * @param [in]  flags                  Flags to modify the function behavior.

From e235a43a3cdce78a9d6d5245ea44ae28bb90ccd2 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 12:40:35 +0000
Subject: [PATCH 19/32] UCP/PERF: Minor changes

---
 src/tools/perf/cuda/cuda_kernel.cuh    | 2 --
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 5a31d3dac41..6f0e44f8fdf 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -13,8 +13,6 @@
 #include <ucs/sys/device_code.h>
 #include <cuda_runtime.h>
 
-#include <memory>
-
 
 typedef unsigned long long ucx_perf_cuda_time_t;
 
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 83e17283833..182639d1f0f 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -14,6 +14,7 @@
 #include <tools/perf/lib/ucp_tests.h>
 
 #include <vector>
+#include <stdexcept>
 
 
 class ucp_perf_cuda_request_manager {

From b8fe11f22ea757bb02e89126defdf82f1df80f2b Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 13:06:14 +0000
Subject: [PATCH 20/32] UCP/PERF: Minor changes

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 182639d1f0f..e428a74c610 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -223,7 +223,8 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx)
 
     do {
         status = ucp_device_progress_req<level>(&req);
-    } while (status == UCS_INPROGRESS);
+        // TODO: remove NO_RESOURCE
+    } while ((status == UCS_INPROGRESS) || (status == UCS_ERR_NO_RESOURCE));
 
     return status;
 }
@@ -240,9 +241,10 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
-            status = request_mgr.progress<level>(1);
-            if (status != UCS_OK) {
-                continue;
+            request_mgr.progress<level>(1);
+            if (UCS_STATUS_IS_ERR(status)) {
+                ctx.status = status;
+                return;
             }
         }
 

From 28454a4beb6fca1c04d42eba81aad2f6166ec7c3 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 13:54:25 +0000
Subject: [PATCH 21/32] UCP/PERF: Separate element for counter

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  2 +-
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 19 ++++++++++++-------
 src/tools/perf/lib/libperf_int.h       |  1 +
 src/tools/perf/lib/libperf_memory.c    |  6 ++++--
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 6f0e44f8fdf..6a6f5b7b9da 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -52,7 +52,7 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
 static UCS_F_ALWAYS_INLINE uint64_t *
 ucx_perf_cuda_get_sn(const void *address, size_t length)
 {
-    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
+    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length);
 }
 
 UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index e428a74c610..db7c1d75746 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -109,7 +109,8 @@ public:
 private:
     void init_mem_list(const ucx_perf_context_t &perf)
     {
-        size_t count = perf.params.msg_size_cnt;
+        /* +1 for the counter */
+        size_t count = perf.params.msg_size_cnt + 1;
         ucp_device_mem_list_elem_t elems[count];
         for (size_t i = 0; i < count; ++i) {
             elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
@@ -135,7 +136,8 @@ private:
 
     void init_elements(const ucx_perf_context_t &perf)
     {
-        size_t count = perf.params.msg_size_cnt;
+        /* +1 for the counter */
+        size_t count = perf.params.msg_size_cnt + 1;
 
         std::vector<unsigned> indices(count);
         std::vector<void*> addresses(count);
@@ -145,7 +147,8 @@ private:
             indices[i]          = i;
             addresses[i]        = (char *)perf.send_buffer + offset;
             remote_addresses[i] = perf.ucp.remote_addr + offset;
-            lengths[i]          = perf.params.msg_size_list[i];
+            lengths[i]          = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
+                                                     perf.params.msg_size_list[i];
             offset             += lengths[i];
         }
 
@@ -196,16 +199,18 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
                                            params.lengths, 1,
                                            params.counter_remote, params.flags,
                                            &req);
-    case UCX_PERF_CMD_PUT_PARTIAL:
+    case UCX_PERF_CMD_PUT_PARTIAL:{
+        unsigned counter_index = params.mem_list->mem_list_length - 1;
         return ucp_device_put_multi_partial<level>(params.mem_list,
                                                    params.indices,
-                                                   params.mem_list->mem_list_length,
+                                                   counter_index,
                                                    params.addresses,
                                                    params.remote_addresses,
                                                    params.lengths,
-                                                   params.mem_list->mem_list_length,
-                                                   1, params.counter_remote,
+                                                   counter_index, 1,
+                                                   params.counter_remote,
                                                    params.flags, &req);
+        }
     }
 
     return UCS_ERR_INVALID_PARAM;
diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h
index 755a1c839e3..3b40ab9d9b7 100644
--- a/src/tools/perf/lib/libperf_int.h
+++ b/src/tools/perf/lib/libperf_int.h
@@ -30,6 +30,7 @@ BEGIN_C_DECLS
 #define UCT_PERF_TEST_AM_ID  5
 #define ADDR_BUF_SIZE        4096
 #define EXTRA_INFO_SIZE      256
+#define ONESIDED_SIGNAL_SIZE sizeof(uint64_t)
 
 #define UCX_PERF_TEST_FOREACH(perf) \
     while (!ucx_perf_context_done(perf))
diff --git a/src/tools/perf/lib/libperf_memory.c b/src/tools/perf/lib/libperf_memory.c
index 0b2a14c1e5f..8fb5a1bb9e5 100644
--- a/src/tools/perf/lib/libperf_memory.c
+++ b/src/tools/perf/lib/libperf_memory.c
@@ -112,7 +112,8 @@ ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf)
     }
 
     /* Allocate send buffer memory */
-    status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count,
+    status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count +
+                                ONESIDED_SIGNAL_SIZE,
                                 params->send_mem_type, &perf->send_buffer,
                                 &perf->ucp.send_memh);
     if (status != UCS_OK) {
@@ -125,7 +126,8 @@ ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf)
     perf->ucp.recv_exported_mem.address = NULL;
 
     /* Allocate receive buffer memory */
-    status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count,
+    status = ucp_perf_mem_alloc(perf, buffer_size * params->thread_count +
+                                ONESIDED_SIGNAL_SIZE,
                                 params->recv_mem_type, &perf->recv_buffer,
                                 &perf->ucp.recv_memh);
     if (status != UCS_OK) {

From 09a7201fd391d52a95f0b790835320786df0ed9e Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 14:12:30 +0000
Subject: [PATCH 22/32] UCP/PERF: Temporary fis for single, until counter write
 is merged

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index db7c1d75746..2e748104876 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -191,8 +191,8 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
         return ucp_device_put_single<level>(params.mem_list, params.indices[0],
                                             params.addresses[0],
                                             params.remote_addresses[0],
-                                            params.lengths[0], params.flags,
-                                            &req);
+                                            params.lengths[0] + ONESIDED_SIGNAL_SIZE,
+                                            params.flags, &req);
     case UCX_PERF_CMD_PUT_MULTI:
         return ucp_device_put_multi<level>(params.mem_list, params.addresses,
                                            params.remote_addresses,

From 99245048d9842e51332a8db34538ac6723265932 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 15:02:52 +0000
Subject: [PATCH 23/32] UCP/PERF: Fixed memory corruption

---
 src/tools/perf/perftest.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c
index 00d782e5632..7da9fea02ed 100644
--- a/src/tools/perf/perftest.c
+++ b/src/tools/perf/perftest.c
@@ -465,8 +465,8 @@ static ucs_status_t setup_sock_rte_p2p(struct perftest_context *ctx)
 
         if (peer_params.super.msg_size_cnt != 0) {
             peer_params.super.msg_size_list =
-                    calloc(ctx->params.super.msg_size_cnt,
-                           sizeof(*ctx->params.super.msg_size_list));
+                    calloc(peer_params.super.msg_size_cnt,
+                           sizeof(*peer_params.super.msg_size_list));
             if (peer_params.super.msg_size_list == NULL) {
                 status = UCS_ERR_NO_MEMORY;
                 goto err_close_connfd;

From 24e20f86524ffcdc30276b030550efda76db9df6 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Thu, 18 Sep 2025 15:27:00 +0000
Subject: [PATCH 24/32] UCP/PERF: Send all elements in single

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 2e748104876..0f147026579 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -76,6 +76,7 @@ private:
 
 struct ucp_perf_cuda_params {
     ucp_device_mem_list_handle_h mem_list;
+    size_t                       length;
     unsigned                     *indices;
     void                         **addresses;
     uint64_t                     *remote_addresses;
@@ -160,11 +161,14 @@ private:
 
     void init_counters(const ucx_perf_context_t &perf)
     {
-        size_t length           = ucx_perf_get_message_size(&perf.params);
+        m_params.length         = ucx_perf_get_message_size(&perf.params);
         m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn(
-                                        (void*)perf.ucp.remote_addr, length);
-        m_params.counter_send   = ucx_perf_cuda_get_sn(perf.send_buffer, length);
-        m_params.counter_recv   = ucx_perf_cuda_get_sn(perf.recv_buffer, length);
+                                        (void*)perf.ucp.remote_addr,
+                                        m_params.length);
+        m_params.counter_send   = ucx_perf_cuda_get_sn(perf.send_buffer,
+                                                       m_params.length);
+        m_params.counter_recv   = ucx_perf_cuda_get_sn(perf.recv_buffer,
+                                                       m_params.length);
         m_params.flags          = UCP_DEVICE_FLAG_NODELAY;
     }
 
@@ -191,7 +195,7 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
         return ucp_device_put_single<level>(params.mem_list, params.indices[0],
                                             params.addresses[0],
                                             params.remote_addresses[0],
-                                            params.lengths[0] + ONESIDED_SIGNAL_SIZE,
+                                            params.length + ONESIDED_SIGNAL_SIZE,
                                             params.flags, &req);
     case UCX_PERF_CMD_PUT_MULTI:
         return ucp_device_put_multi<level>(params.mem_list, params.addresses,

From 8053fdd84149f77c996dcef373e64bcb6aac3c2d Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Fri, 19 Sep 2025 06:52:31 +0000
Subject: [PATCH 25/32] UCP/PERF: Precise doc on blocks param

---
 src/tools/perf/perftest_params.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tools/perf/perftest_params.c b/src/tools/perf/perftest_params.c
index 6c3210d645e..e24a0be5507 100644
--- a/src/tools/perf/perftest_params.c
+++ b/src/tools/perf/perftest_params.c
@@ -138,9 +138,11 @@ static void usage(const struct perftest_context *ctx, const char *program)
     printf("\n");
     printf("  UCP only:\n");
     printf("     -T <threads>[:<blocks>]\n");
-    printf("                    number of threads in the test (%d)\n",
+    printf("                    number of threads in the test (%d).\n",
            ctx->params.super.thread_count);
     printf("                    blocks is optional, it corresponds to the number of device blocks\n");
+    printf("                    if blocks is specified, then threads value corresponds to the number\n");
+    printf("                    of device threads in each block\n");
     printf("     -M <thread>    thread support level for progress engine (single)\n");
     printf("                        single     - only the master thread can access\n");
     printf("                        serialized - one thread can access at a time\n");

From cd48e9e13a744769f3d48c8287fb8ff56c2b434f Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Fri, 19 Sep 2025 11:26:49 +0000
Subject: [PATCH 26/32] UCP/PERF: Exclude multi-thread tests from CI, add API
 tests

---
 contrib/ucx_perftest_config/test_types_ucp_device_cuda | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
index 3d73617525c..e984ea2b830 100644
--- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda
+++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -1,7 +1,9 @@
 #
 # UCP basic device cuda tests
 #
-ucp_device_cuda_bw_1k_1thread         -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
-ucp_device_cuda_bw_1k_128threads      -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128
-ucp_device_cuda_lat_1k_1thread        -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
-ucp_device_cuda_lat_1k_128threads     -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128
+ucp_device_cuda_single_bw_1k_1thread         -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
+ucp_device_cuda_single_lat_1k_1thread        -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
+ucp_device_cuda_multi_bw_1k_1thread          -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
+ucp_device_cuda_multi_lat_1k_1thread         -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000
+ucp_device_cuda_partial_bw_1k_1thread        -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000
+ucp_device_cuda_partial_lat_1k_1thread       -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000

From a01882335c2077dde47aa0107abcabbfd62cb880 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Mon, 22 Sep 2025 08:32:15 +0000
Subject: [PATCH 27/32] UCP/PERF: Fixed ucp_device_progress_req

---
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 4 +++-
 src/ucp/api/device/ucp_device_impl.h   | 7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 0f147026579..44d96585d98 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -250,8 +250,9 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
-            request_mgr.progress<level>(1);
+            status = request_mgr.progress<level>(1);
             if (UCS_STATUS_IS_ERR(status)) {
+                ucs_device_error("progress failed: %d", status);
                 ctx.status = status;
                 return;
             }
@@ -273,6 +274,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
     while (request_mgr.get_pending_count() > 0) {
         status = request_mgr.progress<level>(max_iters);
         if (UCS_STATUS_IS_ERR(status)) {
+            ucs_device_error("progress failed: %d", status);
             break;
         }
     }
diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h
index f82725f7576..08050f9ef3a 100644
--- a/src/ucp/api/device/ucp_device_impl.h
+++ b/src/ucp/api/device/ucp_device_impl.h
@@ -391,7 +391,12 @@ UCS_F_DEVICE ucs_status_t ucp_device_progress_req(ucp_device_request_t *req)
     }
 
     status = uct_device_ep_progress<level>(req->device_ep);
-    return (status != UCS_OK ? status : UCS_INPROGRESS);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    return (ucs_likely(req->comp.count == 0)) ? req->comp.status :
+                                                UCS_INPROGRESS;
 }
 
 #endif /* UCP_DEVICE_IMPL_H */

From 82b253d290fd767ab0aaae5f247230b2a1c48ca6 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Mon, 22 Sep 2025 10:05:33 +0000
Subject: [PATCH 28/32] UCP/PERF: Allocate requests in shared memory

---
 src/tools/perf/cuda/cuda_kernel.cuh    | 22 ++++++++++++++--------
 src/tools/perf/cuda/ucp_cuda_kernel.cu | 21 ++++++++++++---------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 6a6f5b7b9da..4f60e4f1e49 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -89,17 +89,17 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from
     return bits;
 }
 
-#define UCX_KERNEL_CMD(level, cmd, blocks, threads, func, ...) \
+#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
     do { \
         switch (cmd) { \
         case UCX_PERF_CMD_PUT_SINGLE: \
-            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads>>>(__VA_ARGS__); \
+            func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
             break; \
         case UCX_PERF_CMD_PUT_MULTI: \
-            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads>>>(__VA_ARGS__); \
+            func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
             break; \
         case UCX_PERF_CMD_PUT_PARTIAL: \
-            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads>>>(__VA_ARGS__); \
+            func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
             break; \
         default: \
             ucs_error("Unsupported cmd: %d", cmd); \
@@ -113,18 +113,24 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from
         ucx_perf_cmd_t _cmd       = perf.params.command; \
         unsigned _blocks          = perf.params.device_block_count; \
         unsigned _threads         = perf.params.device_thread_count; \
+        size_t _shared_size       = _threads * perf.params.max_outstanding * \
+                                    sizeof(ucp_device_request_t); \
         switch (_level) { \
         case UCS_DEVICE_LEVEL_THREAD: \
-            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
+                           _shared_size, func, __VA_ARGS__); \
             break; \
         case UCS_DEVICE_LEVEL_WARP: \
-            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
+                           _shared_size, func, __VA_ARGS__); \
             break; \
         case UCS_DEVICE_LEVEL_BLOCK: \
-            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
+                           _shared_size, func, __VA_ARGS__); \
             break; \
         case UCS_DEVICE_LEVEL_GRID: \
-            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads, func, __VA_ARGS__); \
+            UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
+                           _shared_size, func, __VA_ARGS__); \
             break; \
         default: \
             ucs_error("Unsupported level: %d", _level); \
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 44d96585d98..5106512c7de 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -19,7 +19,9 @@
 
 class ucp_perf_cuda_request_manager {
 public:
-    __device__ ucp_perf_cuda_request_manager(size_t size) : m_size(size)
+    __device__
+    ucp_perf_cuda_request_manager(size_t size, ucp_device_request_t *requests)
+        : m_size(size), m_requests(&requests[size * threadIdx.x])
     {
         assert(m_size <= CAPACITY);
         for (size_t i = 0; i < m_size; ++i) {
@@ -70,7 +72,7 @@ private:
     static const size_t CAPACITY = 128;
 
     size_t               m_size;
-    ucp_device_request_t m_requests[CAPACITY];
+    ucp_device_request_t *m_requests;
     uint8_t              m_pending[UCX_BITSET_SIZE(CAPACITY)];
 };
 
@@ -222,9 +224,9 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
 
 template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
 UCS_F_DEVICE ucs_status_t
-ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx)
+ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
+                        ucp_device_request_t &req)
 {
-    ucp_device_request_t req;
     ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
     if (status != UCS_OK) {
         return status;
@@ -243,9 +245,10 @@ __global__ void
 ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
                                   ucp_perf_cuda_params params)
 {
+    extern __shared__ ucp_device_request_t requests[];
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
-    ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding);
+    ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding, requests);
     ucs_status_t status;
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
@@ -268,7 +271,6 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         } while (status == UCS_ERR_NO_RESOURCE);
 
         ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
-        __syncthreads();
     }
 
     while (request_mgr.get_pending_count() > 0) {
@@ -288,27 +290,28 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx,
                                        ucp_perf_cuda_params params,
                                        bool is_sender)
 {
+    extern __shared__ ucp_device_request_t requests[];
+    ucp_device_request_t &req             = requests[threadIdx.x];
     ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
     ucx_perf_counter_t max_iters          = ctx.max_iters;
     ucs_status_t status                   = UCS_OK;
 
     for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
         if (is_sender) {
-            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
             if (status != UCS_OK) {
                 break;
             }
             ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
         } else {
             ucx_perf_cuda_wait_sn(params.counter_recv, idx + 1);
-            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx);
+            status = ucp_perf_cuda_send_sync<level, cmd>(params, idx, req);
             if (status != UCS_OK) {
                 break;
             }
         }
 
         ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
-        __syncthreads();
     }
 
     ctx.status = status;

From 9cd91ac7908105fa06c95769e6ab19783d08a889 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Mon, 22 Sep 2025 13:46:45 +0000
Subject: [PATCH 29/32] UCP/PERF: Aggregate MT result

---
 src/tools/perf/cuda/cuda_kernel.cuh    |  5 +++--
 src/tools/perf/cuda/ucp_cuda_kernel.cu |  1 +
 src/tools/perf/lib/libperf_int.h       | 20 +++-----------------
 src/tools/perf/lib/ucp_tests.cc        | 10 +++++-----
 src/tools/perf/lib/uct_tests.cc        |  8 ++++----
 5 files changed, 16 insertions(+), 28 deletions(-)

diff --git a/src/tools/perf/cuda/cuda_kernel.cuh b/src/tools/perf/cuda/cuda_kernel.cuh
index 4f60e4f1e49..9d9c42b598e 100644
--- a/src/tools/perf/cuda/cuda_kernel.cuh
+++ b/src/tools/perf/cuda/cuda_kernel.cuh
@@ -163,11 +163,12 @@ public:
         size_t msg_length                 = ucx_perf_get_message_size(&m_perf.params);
         ucx_perf_counter_t last_completed = 0;
         ucx_perf_counter_t completed      = m_cpu_ctx->completed_iters;
-        while (1) {
+        unsigned thread_count             = m_perf.params.device_thread_count;
+        while (true) {
             ucx_perf_counter_t delta = completed - last_completed;
             if (delta > 0) {
                 // TODO: calculate latency percentile on kernel
-                ucx_perf_update(&m_perf, delta, msg_length);
+                ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
             } else if (completed >= m_perf.max_iter) {
                 break;
             }
diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu
index 5106512c7de..822dbcf22e5 100644
--- a/src/tools/perf/cuda/ucp_cuda_kernel.cu
+++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu
@@ -271,6 +271,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
         } while (status == UCS_ERR_NO_RESOURCE);
 
         ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
+        __syncthreads();
     }
 
     while (request_mgr.get_pending_count() > 0) {
diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h
index 3b40ab9d9b7..3e808ced13d 100644
--- a/src/tools/perf/lib/libperf_int.h
+++ b/src/tools/perf/lib/libperf_int.h
@@ -229,12 +229,13 @@ static inline void ucx_perf_omp_barrier(ucx_perf_context_t *perf)
 
 static UCS_F_ALWAYS_INLINE void ucx_perf_update(ucx_perf_context_t *perf,
                                                 ucx_perf_counter_t iters,
+                                                ucx_perf_counter_t msgs,
                                                 size_t bytes_per_iter)
 {
     perf->current.time   = ucs_get_time();
     perf->current.iters += iters;
-    perf->current.bytes += bytes_per_iter * iters;
-    perf->current.msgs  += iters;
+    perf->current.bytes += msgs * bytes_per_iter;
+    perf->current.msgs  += msgs;
 
     if (iters == 1) {
         perf->timing_queue[perf->timing_queue_head] = perf->current.time -
@@ -254,21 +255,6 @@ static UCS_F_ALWAYS_INLINE void ucx_perf_update(ucx_perf_context_t *perf,
     }
 }
 
-static UCS_F_ALWAYS_INLINE void
-ucx_perf_update_multi(ucx_perf_context_t *perf, ucx_perf_counter_t iters,
-                      size_t bytes)
-{
-    perf->current.time   = ucs_get_time();
-    perf->current.iters += iters;
-    perf->current.bytes += bytes;
-    perf->current.msgs  += iters;
-    perf->prev_time      = perf->current.time;
-
-    if (ucs_likely(perf->current.iters < perf->params.max_iter)) {
-        ucx_perf_report(perf);
-    }
-}
-
 END_C_DECLS
 
 #endif
diff --git a/src/tools/perf/lib/ucp_tests.cc b/src/tools/perf/lib/ucp_tests.cc
index d683bfbc5cd..061c7f1119e 100644
--- a/src/tools/perf/lib/ucp_tests.cc
+++ b/src/tools/perf/lib/ucp_tests.cc
@@ -764,7 +764,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base<uint8_t> {
                 send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey);
                 recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn);
                 wait_recv_window(m_max_outstanding);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++sn;
             }
         } else if (my_index == 1) {
@@ -773,7 +773,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base<uint8_t> {
                 wait_recv_window(m_max_outstanding);
                 send(ep, send_buffer, send_length, send_datatype, sn,
                      remote_addr, rkey, m_perf.current.iters == 0);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++sn;
             }
         }
@@ -828,7 +828,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base<uint8_t> {
                 send(ep, send_buffer, send_length, send_datatype,
                      sn, remote_addr, rkey);
                 recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++sn;
             }
 
@@ -837,7 +837,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base<uint8_t> {
         } else if (my_index == 0) {
             UCX_PERF_TEST_FOREACH(&m_perf) {
                 recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++sn;
             }
 
@@ -850,7 +850,7 @@ class ucp_perf_test_runner : public ucp_perf_test_runner_base<uint8_t> {
             UCX_PERF_TEST_FOREACH(&m_perf) {
                 send(ep, send_buffer, send_length, send_datatype, sn,
                      remote_addr, rkey, m_perf.current.iters == 0);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++sn;
             }
 
diff --git a/src/tools/perf/lib/uct_tests.cc b/src/tools/perf/lib/uct_tests.cc
index 32e2de2efde..f7f44d5f213 100644
--- a/src/tools/perf/lib/uct_tests.cc
+++ b/src/tools/perf/lib/uct_tests.cc
@@ -451,7 +451,7 @@ class uct_perf_test_runner {
             UCX_PERF_TEST_FOREACH(&m_perf) {
                 send_b(ep, send_sn, send_sn - 1, buffer, length, remote_addr,
                        rkey, NULL);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
 
                 do {
                     progress_responder();
@@ -471,7 +471,7 @@ class uct_perf_test_runner {
 
                 send_b(ep, send_sn, send_sn - 1, buffer, length, remote_addr,
                        rkey, NULL);
-                ucx_perf_update(&m_perf, 1, length);
+                ucx_perf_update(&m_perf, 1, 1, length);
                 ++send_sn;
             }
         }
@@ -538,7 +538,7 @@ class uct_perf_test_runner {
                        &m_completion);
             }
 
-            ucx_perf_update(&m_perf, 1, length);
+            ucx_perf_update(&m_perf, 1, 1, length);
         }
 
         if (!flow_control) {
@@ -699,7 +699,7 @@ class uct_perf_test_runner {
         ucx_perf_get_time(&m_perf);
         ucs_assert(outstanding() == 0);
         if (my_index == 1) {
-            ucx_perf_update(&m_perf, 0, 0);
+            ucx_perf_update(&m_perf, 0, 1, 0);
         }
 
         return UCS_OK;

From 31793c25ac9d7b74816717958fa69379a693f131 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Mon, 22 Sep 2025 14:28:59 +0000
Subject: [PATCH 30/32] UCP/PERF: Temporary cuda_ipc fix

---
 src/uct/cuda/cuda_ipc/cuda_ipc.cuh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
index 173a5f019f3..bf24582807c 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh
@@ -201,9 +201,8 @@ UCS_F_DEVICE ucs_status_t uct_cuda_ipc_ep_put_single(
     }
 
     __syncthreads();
-    if (threadIdx.x == 0) {
-        comp->count  = 0;
-    }
+    // TODO: temporary fix, to be removed once cuda_ipc PR is merged
+    comp->count = 0;
     return UCS_OK;
 }
 

From 8265571be23fca2b7d94c8a6c854c9cb3c915b54 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Tue, 23 Sep 2025 06:34:30 +0000
Subject: [PATCH 31/32] UCP/PERF: Added MT CI tests

---
 contrib/test_jenkins.sh                              | 12 +++++++++++-
 .../ucx_perftest_config/test_types_ucp_device_cuda   | 10 ++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh
index d10276fdaec..260c526c75f 100755
--- a/contrib/test_jenkins.sh
+++ b/contrib/test_jenkins.sh
@@ -657,8 +657,18 @@ run_ucx_perftest_cuda_device() {
 	ucx_perftest="$ucx_inst/bin/ucx_perftest"
 	ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
 
-	# TODO: Run on all GPUs
+	# TODO: Run on all GPUs & NICs combinations
 	ucp_client_args="-a cuda:0 $(hostname)"
+	gda_tls="cuda_copy,rc,rc_gda"
+	cuda_ipc_tls="cuda_copy,rc,cuda_ipc"
+
+	# TODO: Run with cuda_ipc_tls
+	for tls in "$gda_tls"
+	do
+		export UCX_TLS=${tls}
+		run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
+	done
+	unset UCX_TLS
 
 	run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
 }
diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
index e984ea2b830..bf2e5bac214 100644
--- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda
+++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -7,3 +7,13 @@ ucp_device_cuda_multi_bw_1k_1thread          -t ucp_put_multi_bw -m cuda -s 256:
 ucp_device_cuda_multi_lat_1k_1thread         -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000
 ucp_device_cuda_partial_bw_1k_1thread        -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000
 ucp_device_cuda_partial_lat_1k_1thread       -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000
+
+# Increase number of threads after following fixes:
+# - Use thread-local memory instead of shared for requests (limit 48K)
+# - Fix WQE size limit of 1024
+ucp_device_cuda_single_bw_1k_32threads       -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32 -O 8
+ucp_device_cuda_single_lat_1k_32threads      -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32 -O 8
+ucp_device_cuda_multi_bw_1k_32threads        -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8
+ucp_device_cuda_multi_lat_1k_32threads       -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8
+ucp_device_cuda_partial_bw_1k_32threads      -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8
+ucp_device_cuda_partial_lat_1k_32threads     -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8

From f9957eb76fec690cc8a5269ea0d73a1388144552 Mon Sep 17 00:00:00 2001
From: Ilia Yastrebov <iyastrebov@nvidia.com>
Date: Tue, 23 Sep 2025 09:59:58 +0000
Subject: [PATCH 32/32] UCP/PERF: Reduced dimensions of CI tests due to WQE
 size 1024 limitation

---
 .../test_types_ucp_device_cuda                   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
index bf2e5bac214..86e24d125ae 100644
--- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda
+++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -1,8 +1,8 @@
 #
 # UCP basic device cuda tests
 #
-ucp_device_cuda_single_bw_1k_1thread         -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
-ucp_device_cuda_single_lat_1k_1thread        -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
+ucp_device_cuda_single_bw_1k_1thread         -t ucp_put_single_bw -m cuda -s 1024 -n 10000
+ucp_device_cuda_single_lat_1k_1thread        -t ucp_put_single_lat -m cuda -s 1024 -n 10000
 ucp_device_cuda_multi_bw_1k_1thread          -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
 ucp_device_cuda_multi_lat_1k_1thread         -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000
 ucp_device_cuda_partial_bw_1k_1thread        -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000
@@ -11,9 +11,9 @@ ucp_device_cuda_partial_lat_1k_1thread       -t ucp_put_partial_lat -m cuda -s 2
 # Increase number of threads after following fixes:
 # - Use thread-local memory instead of shared for requests (limit 48K)
 # - Fix WQE size limit of 1024
-ucp_device_cuda_single_bw_1k_32threads       -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32 -O 8
-ucp_device_cuda_single_lat_1k_32threads      -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32 -O 8
-ucp_device_cuda_multi_bw_1k_32threads        -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8
-ucp_device_cuda_multi_lat_1k_32threads       -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8
-ucp_device_cuda_partial_bw_1k_32threads      -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 8
-ucp_device_cuda_partial_lat_1k_32threads     -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 8
+ucp_device_cuda_single_bw_1k_32threads       -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
+ucp_device_cuda_single_lat_1k_32threads      -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
+ucp_device_cuda_multi_bw_1k_32threads        -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
+ucp_device_cuda_multi_lat_1k_32threads       -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
+ucp_device_cuda_partial_bw_1k_32threads      -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
+ucp_device_cuda_partial_lat_1k_32threads     -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2