Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
78d0469
UCP/API: Work with offsets.
ofirfarjun7 Sep 29, 2025
77101e4
UCP/API: Add channel id.
ofirfarjun7 Sep 29, 2025
ad4a11d
UCP/API: Fix.
ofirfarjun7 Sep 29, 2025
0c6c60f
UCP/API: Add lengths and change multi API.
ofirfarjun7 Sep 29, 2025
a4fa0cc
UCP/API: Improve doc.
ofirfarjun7 Sep 29, 2025
78f365f
UCP/API: Remove unused param.
ofirfarjun7 Sep 29, 2025
719cd6c
UCP/API: Check elements fields.
ofirfarjun7 Sep 29, 2025
497fc22
UCP/API: Better control post atomic.
ofirfarjun7 Sep 29, 2025
0cb09ac
UCP/API: adapt cuda_ipc.
ofirfarjun7 Sep 29, 2025
e7ae20a
UCP/API: Fix.
ofirfarjun7 Sep 29, 2025
43302a5
UCP/API: add channel id to progress.
ofirfarjun7 Sep 30, 2025
ec4a087
UCP/API: Fix static check comment.
ofirfarjun7 Sep 30, 2025
31efa17
UCP/API: Fix comments.
ofirfarjun7 Sep 30, 2025
a22e889
UCP/DEVICE/API: Merge branch 'master' into topic/device-api-use-offsets
ofirfarjun7 Sep 30, 2025
bd14203
UCP/API: Improve.
ofirfarjun7 Sep 30, 2025
d930c60
UCP/API: Improve.
ofirfarjun7 Sep 30, 2025
feb8ed6
UCP/API: Check for elements fields value.
ofirfarjun7 Sep 30, 2025
89c3c5e
UCP/API: CR comments.
ofirfarjun7 Sep 30, 2025
4ca86c8
UCP/API: Fix cuda_ipc test.
ofirfarjun7 Oct 1, 2025
fa285c3
UCP/API: CR comments.
ofirfarjun7 Oct 1, 2025
c4d5dfb
UCP/API: Format.
ofirfarjun7 Oct 1, 2025
12f6f98
UCP/API: Minor.
ofirfarjun7 Oct 1, 2025
fd4aadf
UCP/API: Minor improve.
ofirfarjun7 Oct 1, 2025
375ba74
UCP/API: Minor.
ofirfarjun7 Oct 1, 2025
c0d188f
UCP/API: Bug fix.
ofirfarjun7 Oct 5, 2025
efcfd72
UCP/API: Improve.
ofirfarjun7 Oct 5, 2025
70fdc08
UCP/API: Temporarily disable perftest checks.
ofirfarjun7 Oct 5, 2025
0de577e
UCP/API: Merge branch 'master' into topic/device-api-use-offsets
ofirfarjun7 Oct 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,6 @@ run_ucx_perftest_cuda_device() {
ucx_perftest="$ucx_inst/bin/ucx_perftest"
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"

# TODO: Run on all GPUs & NICs combinations
# TODO: Run on all GPUs & NICs combinations
ucp_client_args="-a cuda:0 $(hostname)"
gda_tls="cuda_copy,rc,rc_gda"
Expand Down
13 changes: 7 additions & 6 deletions contrib/ucx_perftest_config/test_types_ucp_device_cuda
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2
# Increase number of threads after following fixes:
# - Use thread-local memory instead of shared for requests (limit 48K)
# - Fix WQE size limit of 1024
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
# TODO - enable when wqe reserve is fixed.
# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
80 changes: 39 additions & 41 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,9 @@ struct ucp_perf_cuda_params {
ucp_device_mem_list_handle_h mem_list;
size_t length;
unsigned *indices;
void **addresses;
uint64_t *remote_addresses;
size_t *local_offsets;
size_t *remote_offsets;
size_t *lengths;
uint64_t counter_remote;
uint64_t *counter_send;
uint64_t *counter_recv;
ucp_device_flags_t flags;
Expand All @@ -102,8 +101,8 @@ public:
{
ucp_device_mem_list_release(m_params.mem_list);
CUDA_CALL_WARN(cudaFree, m_params.indices);
CUDA_CALL_WARN(cudaFree, m_params.addresses);
CUDA_CALL_WARN(cudaFree, m_params.remote_addresses);
CUDA_CALL_WARN(cudaFree, m_params.local_offsets);
CUDA_CALL_WARN(cudaFree, m_params.remote_offsets);
CUDA_CALL_WARN(cudaFree, m_params.lengths);
}

Expand All @@ -113,13 +112,23 @@ private:
void init_mem_list(const ucx_perf_context_t &perf)
{
/* +1 for the counter */
size_t count = perf.params.msg_size_cnt + 1;
size_t count = perf.params.msg_size_cnt + 1;
size_t offset = 0;
ucp_device_mem_list_elem_t elems[count];

for (size_t i = 0; i < count; ++i) {
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
elems[i].remote_addr = perf.ucp.remote_addr + offset;
elems[i].length = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
offset += elems[i].length;
}

ucp_device_mem_list_params_t params;
Expand All @@ -140,33 +149,30 @@ private:
void init_elements(const ucx_perf_context_t &perf)
{
/* +1 for the counter */
size_t count = perf.params.msg_size_cnt + 1;
size_t count = perf.params.msg_size_cnt + 1;
size_t offset = 0;

std::vector<unsigned> indices(count);
std::vector<void*> addresses(count);
std::vector<uint64_t> remote_addresses(count);
std::vector<size_t> local_offsets(count, 0);
std::vector<size_t> remote_offsets(count, 0);
std::vector<size_t> lengths(count);
for (unsigned i = 0, offset = 0; i < count; ++i) {
indices[i] = i;
addresses[i] = (char *)perf.send_buffer + offset;
remote_addresses[i] = perf.ucp.remote_addr + offset;
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
offset += lengths[i];

for (unsigned i = 0; i < count; ++i) {
indices[i] = i;
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
offset += lengths[i];
}

device_clone(&m_params.indices, indices.data(), count);
device_clone(&m_params.addresses, addresses.data(), count);
device_clone(&m_params.remote_addresses, remote_addresses.data(), count);
device_clone(&m_params.local_offsets, local_offsets.data(), count);
device_clone(&m_params.remote_offsets, remote_offsets.data(), count);
device_clone(&m_params.lengths, lengths.data(), count);
}

void init_counters(const ucx_perf_context_t &perf)
{
m_params.length = ucx_perf_get_message_size(&perf.params);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next pr: align

m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn(
(void*)perf.ucp.remote_addr,
m_params.length);
m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer,
m_params.length);
m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer,
Expand Down Expand Up @@ -195,28 +201,20 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
/* TODO: Change to ucp_device_counter_write */
*params.counter_send = idx + 1;
return ucp_device_put_single<level>(params.mem_list, params.indices[0],
params.addresses[0],
params.remote_addresses[0],
params.length + ONESIDED_SIGNAL_SIZE,
params.flags, &req);
0, 0,
params.length +
ONESIDED_SIGNAL_SIZE,
0, params.flags, &req);
case UCX_PERF_CMD_PUT_MULTI:
return ucp_device_put_multi<level>(params.mem_list, params.addresses,
params.remote_addresses,
params.lengths, 1,
params.counter_remote, params.flags,
return ucp_device_put_multi<level>(params.mem_list, 1, 0, params.flags,
&req);
case UCX_PERF_CMD_PUT_PARTIAL:{
case UCX_PERF_CMD_PUT_PARTIAL: {
unsigned counter_index = params.mem_list->mem_list_length - 1;
return ucp_device_put_multi_partial<level>(params.mem_list,
params.indices,
counter_index,
params.addresses,
params.remote_addresses,
params.lengths,
counter_index, 1,
params.counter_remote,
params.flags, &req);
}
return ucp_device_put_multi_partial<level>(
params.mem_list, params.indices, counter_index,
params.local_offsets, params.remote_offsets, params.lengths,
counter_index, 1, 0, 0, params.flags, &req);
}
}

return UCS_ERR_INVALID_PARAM;
Expand Down
Loading
Loading