contrib/ucx_perftest_config/test_types_ucp_device_cuda (20 changes: 13 additions & 7 deletions)
@@ -11,10 +11,16 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2
# Increase number of threads after following fixes:
# - Use thread-local memory instead of shared for requests (limit 48K)
# - Fix WQE size limit of 1024
# TODO - enable when wqe reserve is fixed.
# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2

ucp_device_cuda_single_bw_1k_1warp -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 -L warp
ucp_device_cuda_single_lat_1k_1warp -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 -L warp
ucp_device_cuda_multi_bw_1k_1warp -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_multi_lat_1k_1warp -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_partial_bw_1k_1warp -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
ucp_device_cuda_partial_lat_1k_1warp -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
src/tools/perf/api/libperf.h (6 changes: 3 additions & 3 deletions)
@@ -266,9 +266,9 @@ typedef struct ucx_perf_params {
ucx_perf_wait_mode_t wait_mode; /* How to wait */
ucs_memory_type_t send_mem_type; /* Send memory type */
ucs_memory_type_t recv_mem_type; /* Recv memory type */
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
ucs_device_level_t device_level; /* Device level for gdaki */
ucx_perf_accel_dev_t send_device; /* Send memory device */
ucx_perf_accel_dev_t recv_device; /* Recv memory device */
ucs_device_level_t device_level; /* Device level */
unsigned flags; /* See ucx_perf_test_flags. */

size_t *msg_size_list; /* Test message sizes list. The size
src/tools/perf/cuda/cuda_kernel.cuh (172 changes: 102 additions & 70 deletions)
@@ -32,22 +32,41 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
return globaltimer;
}

UCS_F_DEVICE void
ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
ucx_perf_counter_t completed,
ucx_perf_counter_t max_iters,
ucx_perf_cuda_time_t &last_report_time)
{
if (threadIdx.x == 0) {
ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns();
if (((current_time - last_report_time) >= ctx.report_interval_ns) ||
(completed >= max_iters)) {
ctx.completed_iters = completed;
last_report_time = current_time;
class ucx_perf_cuda_reporter {
public:
__device__
ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
m_ctx(ctx),
m_max_iters(ctx.max_iters),
m_next_report_iter(1),
m_last_report_time(ucx_perf_cuda_get_time_ns()),
m_report_interval_ns(ctx.report_interval_ns / 5)
{
}

__device__ inline void
update_report(ucx_perf_counter_t completed)
{
if ((threadIdx.x == 0) && ucs_unlikely(completed >= m_next_report_iter)) {
ucx_perf_cuda_time_t cur_time = ucx_perf_cuda_get_time_ns();
ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
(completed - m_ctx.completed_iters);
m_last_report_time = cur_time;
m_ctx.completed_iters = completed;
__threadfence();

m_next_report_iter = ucs_min(completed + (m_report_interval_ns / iter_time),
m_max_iters);
}
}
}

private:
ucx_perf_cuda_context &m_ctx;
ucx_perf_counter_t m_max_iters;
ucx_perf_counter_t m_next_report_iter;
ucx_perf_cuda_time_t m_last_report_time;
ucx_perf_cuda_time_t m_report_interval_ns;
};
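
Note: a minimal usage sketch (not part of this diff) of how a measurement kernel would drive the reporter; the kernel name and the iteration body are hypothetical placeholders.

__global__ void example_put_kernel(ucx_perf_cuda_context *ctx)
{
    ucx_perf_cuda_reporter reporter(*ctx);
    for (ucx_perf_counter_t i = 0; i < ctx->max_iters; ++i) {
        /* ... issue one put and wait for its completion ... */
        reporter.update_report(i + 1); /* only threadIdx.x == 0 publishes */
    }
}

Throttling on m_next_report_iter keeps the common path to a single integer compare per iteration instead of a global-timer read.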

static UCS_F_ALWAYS_INLINE uint64_t *
ucx_perf_cuda_get_sn(const void *address, size_t length)
@@ -63,83 +82,91 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
__syncthreads();
}

/* Simple bitset */
#define UCX_BIT_MASK(bit) (1 << ((bit) & (CHAR_BIT - 1)))
#define UCX_BIT_SET(set, bit) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
#define UCX_BIT_GET(set, bit) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
#define UCX_BITSET_SIZE(bits) ((bits + CHAR_BIT - 1) / CHAR_BIT)

UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
size_t count = 0;
for (size_t i = 0; i < bits; i++) {
if (UCX_BIT_GET(set, i)) {
count++;
}
template<ucs_device_level_t level>
__host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
{
switch (level) {
case UCS_DEVICE_LEVEL_THREAD: return tid;
/* TODO: use UCS_DEVICE_NUM_THREADS_IN_WARP */
case UCS_DEVICE_LEVEL_WARP: return tid / 32;
default: return 0;
}
return count;
}

#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
(_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
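
Note: illustrative values only (not part of this diff), assuming the standard 32-thread warp:

/* thread id 37 inside a 64-thread block */
ucx_perf_cuda_thread_index<UCS_DEVICE_LEVEL_THREAD>(37); /* 37: per-thread index */
ucx_perf_cuda_thread_index<UCS_DEVICE_LEVEL_WARP>(37);   /* 1: owning warp index */
ucx_perf_cuda_thread_index<UCS_DEVICE_LEVEL_BLOCK>(37);  /* 0: block acts as one sender */

Applied to a thread count rather than a thread id, the same helper gives the number of independent senders, which is how the host loop below derives msgs_per_iter.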

/* Simple bitset */
#define UCX_BIT_TYPE uint64_t
#define UCX_BIT_SIZE (sizeof(UCX_BIT_TYPE) * CHAR_BIT)
#define UCX_BIT_MASK(bit) (1ULL << ((bit) & (UCX_BIT_SIZE - 1)))
#define UCX_BIT_SET(set, bit) (set[(bit)/UCX_BIT_SIZE] |= UCX_BIT_MASK(bit))
#define UCX_BIT_RESET(set, bit) (set[(bit)/UCX_BIT_SIZE] &= ~UCX_BIT_MASK(bit))
#define UCX_BIT_GET(set, bit) (set[(bit)/UCX_BIT_SIZE] & UCX_BIT_MASK(bit))
#define UCX_BITSET_SIZE(bits) ((bits + UCX_BIT_SIZE - 1) / UCX_BIT_SIZE)

UCS_F_DEVICE size_t
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
ucx_bitset_ffns(const UCX_BIT_TYPE *set, size_t bits)
{
for (size_t i = from; i < bits; i++) {
if (!UCX_BIT_GET(set, i)) {
return i;
for (size_t i = 0; i < UCX_BITSET_SIZE(bits); ++i) {
size_t bit = __ffsll(~set[i]);
if (bit) {
return i * UCX_BIT_SIZE + bit - 1;
}
}
return bits;
}
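
Note: a device-code usage sketch (not part of this diff); the word-at-a-time __ffsll scan replaces the old per-bit loop over uint8_t words.

UCX_BIT_TYPE reqs[UCX_BITSET_SIZE(128)] = {0}; /* all 128 slots free */
size_t slot = ucx_bitset_ffns(reqs, 128);      /* -> 0 */
UCX_BIT_SET(reqs, slot);                       /* mark slot 0 busy */
slot = ucx_bitset_ffns(reqs, 128);             /* -> 1 */
UCX_BIT_RESET(reqs, 0);                        /* release slot 0 */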

#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
do { \
switch (cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_MULTI: \
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_PARTIAL: \
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", cmd); \
break; \
} \
} while (0)
#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
switch (_cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
_func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_MULTI: \
_func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_PARTIAL: \
_func(UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", _cmd); \
break; \
}

#define UCX_KERNEL_DISPATCH(perf, func, ...) \
do { \
ucs_device_level_t _level = perf.params.device_level; \
ucx_perf_cmd_t _cmd = perf.params.command; \
unsigned _blocks = perf.params.device_block_count; \
unsigned _threads = perf.params.device_thread_count; \
size_t _shared_size = _threads * perf.params.max_outstanding * \
sizeof(ucp_device_request_t); \
switch (_level) { \
#define UCX_PERF_SWITCH_LEVEL(_level, _func, ...) \
switch (_level) { \
case UCS_DEVICE_LEVEL_THREAD: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
_func(UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_WARP: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
_func(UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_BLOCK: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
case UCS_DEVICE_LEVEL_GRID: \
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
_shared_size, func, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported level: %d", _level); \
break; \
} \
}

#define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL(_cmd, _level, _perf, _kernel, ...) \
do { \
unsigned _blocks = _perf.params.device_block_count; \
unsigned _threads = _perf.params.device_thread_count; \
size_t _shared_size = _perf.params.max_outstanding * \
sizeof(ucp_device_request_t) * \
ucx_perf_cuda_thread_index<_level>(_threads); \
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \
} while (0)

#define UCX_PERF_KERNEL_DISPATCH_CMD(_level, _perf, _kernel, ...) \
UCX_PERF_SWITCH_CMD(_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
_level, _perf, _kernel, __VA_ARGS__);

#define UCX_PERF_KERNEL_DISPATCH(_perf, _kernel, ...) \
UCX_PERF_SWITCH_LEVEL(_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
_perf, _kernel, __VA_ARGS__);
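
Note: an illustrative expansion (not part of this diff). With device_level == UCS_DEVICE_LEVEL_WARP and command == UCX_PERF_CMD_PUT_MULTI, UCX_PERF_KERNEL_DISPATCH(perf, my_kernel, args...) reduces to a single templated launch whose shared memory is sized per warp rather than per thread:

my_kernel<UCS_DEVICE_LEVEL_WARP, UCX_PERF_CMD_PUT_MULTI>
    <<<perf.params.device_block_count, perf.params.device_thread_count,
       perf.params.max_outstanding * sizeof(ucp_device_request_t) *
       (perf.params.device_thread_count / 32)>>>(args...);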


class ucx_perf_cuda_test_runner {
public:
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
@@ -151,7 +178,7 @@ public:
m_cpu_ctx->completed_iters = 0;
m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
ULONG_MAX :
ucs_time_to_nsec(perf.report_interval) / 100;
ucs_time_to_nsec(perf.report_interval);
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
}

@@ -166,11 +193,16 @@ public:
ucx_perf_counter_t last_completed = 0;
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
unsigned thread_count = m_perf.params.device_thread_count;
ucs_device_level_t level = m_perf.params.device_level;
unsigned msgs_per_iter;
UCX_PERF_SWITCH_LEVEL(level, UCX_PERF_THREAD_INDEX_SET, thread_count,
msgs_per_iter);

while (true) {
ucx_perf_counter_t delta = completed - last_completed;
if (delta > 0) {
// TODO: calculate latency percentile on kernel
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
ucx_perf_update(&m_perf, delta, delta * msgs_per_iter, msg_length);
} else if (completed >= m_perf.max_iter) {
break;
}