Merged

35 commits
a7a6f00
UCP/PERF: Added config for block count
iyastreb Sep 17, 2025
c27a9a8
UCP/PERF: Added tests CMDs for single/partial
iyastreb Sep 17, 2025
0927c2f
UCP/PERF: Compact form for message sizes
iyastreb Sep 17, 2025
5a4be0f
UCP/PERF: Device level option
iyastreb Sep 17, 2025
4c4a3a6
UCP/PERF: Support for IOV
iyastreb Sep 17, 2025
5a59ee2
UCP/PERF: Kernel dispatch macro
iyastreb Sep 17, 2025
0fcc18c
UCP/PERF: Fixed coverity warning
iyastreb Sep 17, 2025
50fbf79
UCP/PERF: Fixed build error
iyastreb Sep 17, 2025
d99de4b
UCP/PERF: Common send function for all APIs
iyastreb Sep 17, 2025
7e7fd34
UCP/PERF: Common element list for all APIs
iyastreb Sep 17, 2025
0733535
UCP/PERF: Common params for all APIs
iyastreb Sep 18, 2025
74f4b69
UCP/PERF: Report kernel status
iyastreb Sep 18, 2025
4015d74
UCP/PERF: Reduce number of kernel args
iyastreb Sep 18, 2025
39188ba
UCP/PERF: Progress fix from Thomas
iyastreb Sep 18, 2025
433c48d
UCP/PERF: TODO comment
iyastreb Sep 18, 2025
dc25dd4
UCP/PERF: Counters in params
iyastreb Sep 18, 2025
881d9ce
UCP/PERF: put multi
iyastreb Sep 18, 2025
6276578
UCP/PERF: put partial
iyastreb Sep 18, 2025
c4115ed
UCP/PERF: Merge branch 'master' into ucp-perf-device-level
iyastreb Sep 18, 2025
e235a43
UCP/PERF: Minor changes
iyastreb Sep 18, 2025
b8fe11f
UCP/PERF: Minor changes
iyastreb Sep 18, 2025
28454a4
UCP/PERF: Separate element for counter
iyastreb Sep 18, 2025
09a7201
UCP/PERF: Temporary fix for single, until counter write is merged
iyastreb Sep 18, 2025
9924504
UCP/PERF: Fixed memory corruption
iyastreb Sep 18, 2025
24e20f8
UCP/PERF: Send all elements in single
iyastreb Sep 18, 2025
8053fdd
UCP/PERF: Precise doc on blocks param
iyastreb Sep 19, 2025
cd48e9e
UCP/PERF: Exclude multi-thread tests from CI, add API tests
iyastreb Sep 19, 2025
a018823
UCP/PERF: Fixed ucp_device_progress_req
iyastreb Sep 22, 2025
3c4e837
UCP/PERF: Merge branch 'master' into ucp-perf-device-level
iyastreb Sep 22, 2025
82b253d
UCP/PERF: Allocate requests in shared memory
iyastreb Sep 22, 2025
9cd91ac
UCP/PERF: Aggregate MT result
iyastreb Sep 22, 2025
31793c2
UCP/PERF: Temporary cuda_ipc fix
iyastreb Sep 22, 2025
8265571
UCP/PERF: Added MT CI tests
iyastreb Sep 23, 2025
a08733f
UCP/PERF: Merged with master
iyastreb Sep 23, 2025
f9957eb
UCP/PERF: Reduced dimensions of CI tests due to WQE size 1024 limitation
iyastreb Sep 23, 2025
1 change: 1 addition & 0 deletions contrib/test_jenkins.sh
@@ -657,6 +657,7 @@ run_ucx_perftest_cuda_device() {
ucx_perftest="$ucx_inst/bin/ucx_perftest"
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"

# TODO: Run on all GPUs & NICs combinations
Contributor:
need to remove (can do in next pr)
ucp_client_args="-a cuda:0 $(hostname)"
gda_tls="cuda_copy,rc,rc_gda"
22 changes: 16 additions & 6 deletions contrib/ucx_perftest_config/test_types_ucp_device_cuda
@@ -1,9 +1,19 @@
#
# UCP basic device cuda tests
#
ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
# TODO - Increase number of threads after adjusting perftest.
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
# TODO - Increase number of threads after adjusting perftest.
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000
ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000
ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000

# Increase number of threads after following fixes:
# - Use thread-local memory instead of shared for requests (limit 48K)
# - Fix WQE size limit of 1024
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
Comment on lines +11 to +19
Contributor:
shall we test warp level?

Contributor Author:
will be tested in the next PR

5 changes: 5 additions & 0 deletions src/tools/perf/api/libperf.h
@@ -11,6 +11,7 @@
#define UCX_LIBPERF_H

#include <ucs/sys/compiler.h>
#include <ucs/sys/device_code.h>

BEGIN_C_DECLS

@@ -30,7 +31,9 @@ typedef enum {
typedef enum {
UCX_PERF_CMD_AM,
UCX_PERF_CMD_PUT,
UCX_PERF_CMD_PUT_SINGLE,
UCX_PERF_CMD_PUT_MULTI,
UCX_PERF_CMD_PUT_PARTIAL,
UCX_PERF_CMD_GET,
UCX_PERF_CMD_ADD,
UCX_PERF_CMD_FADD,
@@ -265,6 +268,7 @@ typedef struct ucx_perf_params {
ucs_memory_type_t recv_mem_type; /* Recv memory type */
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
ucs_device_level_t device_level; /* Device level for gdaki */
Contributor:
minor - i'd remove gdaki

Contributor Author:
done for all three

unsigned flags; /* See ucx_perf_test_flags. */

size_t *msg_size_list; /* Test message sizes list. The size
@@ -284,6 +288,7 @@ typedef struct ucx_perf_params {
double percentile_rank; /* The percentile rank of the percentile reported
in latency tests */
unsigned device_thread_count; /* Number of device threads */
unsigned device_block_count; /* Number of device blocks */
Contributor:
"Number of device threads in block"
Need to make sure device_thread_count is not larger than max num of threads in block.

void *rte_group; /* Opaque RTE group handle */
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */
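As a side note on the review comment above, the device_thread_count cap could be enforced on the host side before the kernel launch. A minimal sketch, not part of this PR: ucx_perf_cuda_validate_dims() is a hypothetical helper, ucx_perf_params_t is the typedef from libperf.h, and the attribute query is the standard CUDA runtime API.

#include <cuda_runtime.h>
#include <ucs/type/status.h>
/* ucx_perf_params_t (with the device_thread_count / device_block_count fields
 * from the diff above) is declared in src/tools/perf/api/libperf.h */

static ucs_status_t
ucx_perf_cuda_validate_dims(const ucx_perf_params_t *params, int cuda_device)
{
    int max_threads;

    /* Query the per-block thread limit of the selected GPU */
    if (cudaDeviceGetAttribute(&max_threads, cudaDevAttrMaxThreadsPerBlock,
                               cuda_device) != cudaSuccess) {
        return UCS_ERR_IO_ERROR;
    }

    if ((params->device_thread_count == 0) ||
        (params->device_thread_count > (unsigned)max_threads)) {
        return UCS_ERR_INVALID_PARAM;
    }

    if (params->device_block_count == 0) {
        return UCS_ERR_INVALID_PARAM;
    }

    return UCS_OK;
}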
81 changes: 66 additions & 15 deletions src/tools/perf/cuda/cuda_kernel.cuh
@@ -49,15 +49,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
}
}

UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
static UCS_F_ALWAYS_INLINE uint64_t *
ucx_perf_cuda_get_sn(const void *address, size_t length)
{
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length);
}

UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
{
if (threadIdx.x == 0) {
while (*sn < value);
while (ucs_device_atomic64_read(sn) < value);
}
__syncthreads();
}
@@ -79,8 +80,8 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
return count;
}

UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
size_t from)
UCS_F_DEVICE size_t
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
{
for (size_t i = from; i < bits; i++) {
if (!UCX_BIT_GET(set, i)) {
@@ -90,6 +91,55 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
return bits;
}

#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
Contributor:
use _ prefix for macro args

Contributor Author:
done

do { \
switch (cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_MULTI: \
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_PARTIAL: \
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", cmd); \
break; \
} \
} while (0)

#define UCX_KERNEL_DISPATCH(perf, func, ...) \
Contributor:
1. use _ prefix for macro args
2. IMO add PERF to the name: UCX_PERF_KERNEL_DISPATCH

Contributor Author:
done
I also refactored these macros to be more generic

do { \
ucs_device_level_t _level = perf.params.device_level; \
ucx_perf_cmd_t _cmd = perf.params.command; \
unsigned _blocks = perf.params.device_block_count; \
unsigned _threads = perf.params.device_thread_count; \
size_t _shared_size = _threads * perf.params.max_outstanding * \
sizeof(ucp_device_request_t); \
switch (_level) { \
case UCS_DEVICE_LEVEL_THREAD: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_WARP: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_BLOCK: \
Contributor:
Block and Grid are still not supported

Contributor Author:
I think we can still keep them here?

UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_GRID: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported level: %d", _level); \
break; \
} \
} while (0)

class ucx_perf_cuda_test_runner {
public:
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
@@ -110,17 +160,17 @@
CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
}

ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }

void wait_for_kernel(size_t msg_length)
void wait_for_kernel()
{
size_t msg_length = ucx_perf_get_message_size(&m_perf.params);
ucx_perf_counter_t last_completed = 0;
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
while (1) {
unsigned thread_count = m_perf.params.device_thread_count;
while (true) {
ucx_perf_counter_t delta = completed - last_completed;
if (delta > 0) {
// TODO: calculate latency percentile on kernel
ucx_perf_update(&m_perf, delta, msg_length);
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
} else if (completed >= m_perf.max_iter) {
break;
}
@@ -133,6 +183,8 @@ public:

protected:
ucx_perf_context_t &m_perf;
ucx_perf_cuda_context *m_cpu_ctx;
ucx_perf_cuda_context *m_gpu_ctx;

private:
void init_ctx()
@@ -142,17 +194,16 @@ private:
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
&m_gpu_ctx, m_cpu_ctx, 0);
}

ucx_perf_cuda_context *m_cpu_ctx;
ucx_perf_cuda_context *m_gpu_ctx;
};


template<typename Runner> ucs_status_t
ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
{
Runner runner(*perf);
if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) {
if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
(perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
(perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
return runner.run_pingpong();
} else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) {
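For context, the dispatch macros above expect a kernel templated on both the device level and the perf command, so every combination is instantiated at compile time and selected by the runtime switch. A minimal usage sketch: my_put_kernel and launch_put are hypothetical stand-ins for the PR's real kernels (which take different arguments), and the macro is shown under the UCX_KERNEL_DISPATCH name used in this diff (the review thread indicates it was later renamed to UCX_PERF_KERNEL_DISPATCH).

/* Hypothetical kernel: level and cmd are compile-time template parameters,
 * so the body can call the matching ucp device put variant without
 * runtime branching. */
template<ucs_device_level_t level, ucx_perf_cmd_t cmd>
__global__ void my_put_kernel(ucx_perf_cuda_context *ctx)
{
    /* ... issue device-side puts and advance ctx->completed_iters ... */
}

static void launch_put(ucx_perf_context_t &perf, ucx_perf_cuda_context *gpu_ctx)
{
    /* Expands to nested switches over perf.params.device_level and
     * perf.params.command, launching e.g.
     *   my_put_kernel<UCS_DEVICE_LEVEL_THREAD, UCX_PERF_CMD_PUT_MULTI>
     *       <<<_blocks, _threads, _shared_size>>>(gpu_ctx);
     * with _shared_size = _threads * max_outstanding * sizeof(ucp_device_request_t). */
    UCX_KERNEL_DISPATCH(perf, my_put_kernel, gpu_ctx);
}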