-
Notifications
You must be signed in to change notification settings - Fork 490
UCP/PERF: UCP tests with configurable level/api/batch #10893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a7a6f00
c27a9a8
0927c2f
5a4be0f
4c4a3a6
5a59ee2
0fcc18c
50fbf79
d99de4b
7e7fd34
0733535
74f4b69
4015d74
39188ba
433c48d
dc25dd4
881d9ce
6276578
c4115ed
e235a43
b8fe11f
28454a4
09a7201
9924504
24e20f8
8053fdd
cd48e9e
a018823
3c4e837
82b253d
9cd91ac
31793c2
8265571
a08733f
f9957eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,19 @@ | ||
# | ||
# UCP basic device cuda tests | ||
# | ||
ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 | ||
# TODO - Increase number of threads after adjusting perftest. | ||
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32 | ||
ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 | ||
# TODO - Increase number of threads after adjusting perftest. | ||
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32 | ||
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000 | ||
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000 | ||
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 | ||
ucp_device_cuda_multi_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 | ||
ucp_device_cuda_partial_bw_1k_1thread -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 | ||
ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 | ||
|
||
# Increase number of threads after following fixes: | ||
# - Use thread-local memory instead of shared for requests (limit 48K) | ||
# - Fix WQE size limit of 1024 | ||
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 | ||
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 | ||
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 | ||
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 | ||
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 | ||
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 | ||
Comment on lines
+11
to
+19
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we test warp level? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will be tested in the next PR |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
#define UCX_LIBPERF_H | ||
|
||
#include <ucs/sys/compiler.h> | ||
#include <ucs/sys/device_code.h> | ||
|
||
BEGIN_C_DECLS | ||
|
||
|
@@ -30,7 +31,9 @@ typedef enum { | |
typedef enum { | ||
UCX_PERF_CMD_AM, | ||
UCX_PERF_CMD_PUT, | ||
UCX_PERF_CMD_PUT_SINGLE, | ||
UCX_PERF_CMD_PUT_MULTI, | ||
UCX_PERF_CMD_PUT_PARTIAL, | ||
UCX_PERF_CMD_GET, | ||
UCX_PERF_CMD_ADD, | ||
UCX_PERF_CMD_FADD, | ||
|
@@ -265,6 +268,7 @@ typedef struct ucx_perf_params { | |
ucs_memory_type_t recv_mem_type; /* Recv memory type */ | ||
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */ | ||
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */ | ||
ucs_device_level_t device_level; /* Device level for gdaki */ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor - i'd remove gdaki There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done for all three |
||
unsigned flags; /* See ucx_perf_test_flags. */ | ||
|
||
size_t *msg_size_list; /* Test message sizes list. The size | ||
|
@@ -284,6 +288,7 @@ typedef struct ucx_perf_params { | |
double percentile_rank; /* The percentile rank of the percentile reported | ||
in latency tests */ | ||
unsigned device_thread_count; /* Number of device threads */ | ||
unsigned device_block_count; /* Number of device blocks */ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Number of device threads in block" |
||
|
||
void *rte_group; /* Opaque RTE group handle */ | ||
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,15 +49,16 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx, | |
} | ||
} | ||
|
||
UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length) | ||
static UCS_F_ALWAYS_INLINE uint64_t * | ||
ucx_perf_cuda_get_sn(const void *address, size_t length) | ||
{ | ||
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t)); | ||
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length); | ||
} | ||
|
||
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value) | ||
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value) | ||
{ | ||
if (threadIdx.x == 0) { | ||
while (*sn < value); | ||
while (ucs_device_atomic64_read(sn) < value); | ||
yosefe marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
__syncthreads(); | ||
} | ||
|
@@ -79,8 +80,8 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) { | |
return count; | ||
} | ||
|
||
UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, | ||
size_t from) | ||
UCS_F_DEVICE size_t | ||
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from) | ||
{ | ||
for (size_t i = from; i < bits; i++) { | ||
if (!UCX_BIT_GET(set, i)) { | ||
|
@@ -90,6 +91,55 @@ UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits, | |
return bits; | ||
} | ||
|
||
#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use _ prefix for macro args There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
do { \ | ||
switch (cmd) { \ | ||
case UCX_PERF_CMD_PUT_SINGLE: \ | ||
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \ | ||
break; \ | ||
case UCX_PERF_CMD_PUT_MULTI: \ | ||
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \ | ||
break; \ | ||
case UCX_PERF_CMD_PUT_PARTIAL: \ | ||
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \ | ||
break; \ | ||
default: \ | ||
ucs_error("Unsupported cmd: %d", cmd); \ | ||
break; \ | ||
} \ | ||
} while (0) | ||
|
||
#define UCX_KERNEL_DISPATCH(perf, func, ...) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
do { \ | ||
ucs_device_level_t _level = perf.params.device_level; \ | ||
ucx_perf_cmd_t _cmd = perf.params.command; \ | ||
unsigned _blocks = perf.params.device_block_count; \ | ||
unsigned _threads = perf.params.device_thread_count; \ | ||
size_t _shared_size = _threads * perf.params.max_outstanding * \ | ||
sizeof(ucp_device_request_t); \ | ||
switch (_level) { \ | ||
case UCS_DEVICE_LEVEL_THREAD: \ | ||
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\ | ||
_shared_size, func, __VA_ARGS__); \ | ||
break; \ | ||
case UCS_DEVICE_LEVEL_WARP: \ | ||
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\ | ||
_shared_size, func, __VA_ARGS__); \ | ||
break; \ | ||
case UCS_DEVICE_LEVEL_BLOCK: \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Block and Grid are still not supported There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can still keep them here? |
||
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\ | ||
_shared_size, func, __VA_ARGS__); \ | ||
break; \ | ||
case UCS_DEVICE_LEVEL_GRID: \ | ||
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\ | ||
_shared_size, func, __VA_ARGS__); \ | ||
break; \ | ||
default: \ | ||
ucs_error("Unsupported level: %d", _level); \ | ||
break; \ | ||
} \ | ||
} while (0) | ||
|
||
class ucx_perf_cuda_test_runner { | ||
public: | ||
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf) | ||
|
@@ -110,17 +160,17 @@ public: | |
CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx); | ||
} | ||
|
||
ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; } | ||
|
||
void wait_for_kernel(size_t msg_length) | ||
void wait_for_kernel() | ||
{ | ||
size_t msg_length = ucx_perf_get_message_size(&m_perf.params); | ||
ucx_perf_counter_t last_completed = 0; | ||
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters; | ||
while (1) { | ||
unsigned thread_count = m_perf.params.device_thread_count; | ||
while (true) { | ||
ucx_perf_counter_t delta = completed - last_completed; | ||
if (delta > 0) { | ||
// TODO: calculate latency percentile on kernel | ||
ucx_perf_update(&m_perf, delta, msg_length); | ||
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length); | ||
} else if (completed >= m_perf.max_iter) { | ||
break; | ||
} | ||
|
@@ -133,6 +183,8 @@ public: | |
|
||
protected: | ||
ucx_perf_context_t &m_perf; | ||
ucx_perf_cuda_context *m_cpu_ctx; | ||
ucx_perf_cuda_context *m_gpu_ctx; | ||
|
||
private: | ||
void init_ctx() | ||
|
@@ -142,17 +194,16 @@ private: | |
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer, | ||
&m_gpu_ctx, m_cpu_ctx, 0); | ||
} | ||
|
||
ucx_perf_cuda_context *m_cpu_ctx; | ||
ucx_perf_cuda_context *m_gpu_ctx; | ||
}; | ||
|
||
|
||
template<typename Runner> ucs_status_t | ||
ucx_perf_cuda_dispatch(ucx_perf_context_t *perf) | ||
{ | ||
Runner runner(*perf); | ||
if (perf->params.command == UCX_PERF_CMD_PUT_MULTI) { | ||
if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) || | ||
(perf->params.command == UCX_PERF_CMD_PUT_SINGLE) || | ||
(perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) { | ||
if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) { | ||
return runner.run_pingpong(); | ||
} else if (perf->params.test_type == UCX_PERF_TEST_TYPE_STREAM_UNI) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need to remove (can do in next pr)