Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
78d0469
UCP/API: Work with offsets.
ofirfarjun7 Sep 29, 2025
77101e4
UCP/API: Add channel id.
ofirfarjun7 Sep 29, 2025
ad4a11d
UCP/API: Fix.
ofirfarjun7 Sep 29, 2025
0c6c60f
UCP/API: Add lengths and change multi API.
ofirfarjun7 Sep 29, 2025
a4fa0cc
UCP/API: Improve doc.
ofirfarjun7 Sep 29, 2025
78f365f
UCP/API: Remove unused param.
ofirfarjun7 Sep 29, 2025
719cd6c
UCP/API: Check elements fields.
ofirfarjun7 Sep 29, 2025
497fc22
UCP/API: Better control post atomic.
ofirfarjun7 Sep 29, 2025
0cb09ac
UCP/API: adapt cuda_ipc.
ofirfarjun7 Sep 29, 2025
e7ae20a
UCP/API: Fix.
ofirfarjun7 Sep 29, 2025
43302a5
UCP/API: add channel id to progress.
ofirfarjun7 Sep 30, 2025
ec4a087
UCP/API: Fix static check comment.
ofirfarjun7 Sep 30, 2025
31efa17
UCP/API: Fix comments.
ofirfarjun7 Sep 30, 2025
a22e889
UCP/DEVICE/API: Merge branch 'master' into topic/device-api-use-offsets
ofirfarjun7 Sep 30, 2025
bd14203
UCP/API: Improve.
ofirfarjun7 Sep 30, 2025
d930c60
UCP/API: Improve.
ofirfarjun7 Sep 30, 2025
feb8ed6
UCP/API: Check for elements fields value.
ofirfarjun7 Sep 30, 2025
89c3c5e
UCP/API: CR comments.
ofirfarjun7 Sep 30, 2025
4ca86c8
UCP/API: Fix cuda_ipc test.
ofirfarjun7 Oct 1, 2025
fa285c3
UCP/API: CR comments.
ofirfarjun7 Oct 1, 2025
c4d5dfb
UCP/API: Format.
ofirfarjun7 Oct 1, 2025
12f6f98
UCP/API: Minor.
ofirfarjun7 Oct 1, 2025
fd4aadf
UCP/API: Minor improve.
ofirfarjun7 Oct 1, 2025
375ba74
UCP/API: Minor.
ofirfarjun7 Oct 1, 2025
c0d188f
UCP/API: Bug fix.
ofirfarjun7 Oct 5, 2025
efcfd72
UCP/API: Improve.
ofirfarjun7 Oct 5, 2025
70fdc08
UCP/API: Temporary disable perftest checks.
ofirfarjun7 Oct 5, 2025
0de577e
UCP/API: Merge branch 'master' into topic/device-api-use-offsets
ofirfarjun7 Oct 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 40 additions & 43 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public:

for (size_t i = 0; i < m_size; i++) {
if (UCX_BIT_GET(m_pending, i)) {
status = ucp_device_progress_req<level>(&m_requests[i]);
status = ucp_device_progress_req<level>(&m_requests[i], 0);
if (status == UCS_INPROGRESS) {
continue;
}
Expand Down Expand Up @@ -80,10 +80,9 @@ struct ucp_perf_cuda_params {
ucp_device_mem_list_handle_h mem_list;
size_t length;
unsigned *indices;
void **addresses;
uint64_t *remote_addresses;
size_t *local_offsets;
size_t *remote_offsets;
size_t *lengths;
uint64_t counter_remote;
uint64_t *counter_send;
uint64_t *counter_recv;
ucp_device_flags_t flags;
Expand All @@ -102,8 +101,8 @@ public:
{
ucp_device_mem_list_release(m_params.mem_list);
CUDA_CALL_WARN(cudaFree, m_params.indices);
CUDA_CALL_WARN(cudaFree, m_params.addresses);
CUDA_CALL_WARN(cudaFree, m_params.remote_addresses);
CUDA_CALL_WARN(cudaFree, m_params.local_offsets);
CUDA_CALL_WARN(cudaFree, m_params.remote_offsets);
CUDA_CALL_WARN(cudaFree, m_params.lengths);
}

Expand All @@ -115,11 +114,20 @@ private:
/* +1 for the counter */
size_t count = perf.params.msg_size_cnt + 1;
ucp_device_mem_list_elem_t elems[count];
size_t offset = 0;
for (size_t i = 0; i < count; ++i) {
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].local_addr = (char*)perf.send_buffer + offset;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: use UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset) here instead of the manual (char*) cast

elems[i].remote_addr = perf.ucp.remote_addr + offset;
elems[i].length = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next pr: align

offset += perf.params.msg_size_list[i];
}

ucp_device_mem_list_params_t params;
Expand All @@ -143,30 +151,27 @@ private:
size_t count = perf.params.msg_size_cnt + 1;

std::vector<unsigned> indices(count);
std::vector<void*> addresses(count);
std::vector<uint64_t> remote_addresses(count);
std::vector<size_t> local_offsets(count);
std::vector<size_t> remote_offsets(count);
std::vector<size_t> lengths(count);
for (unsigned i = 0, offset = 0; i < count; ++i) {
indices[i] = i;
addresses[i] = (char *)perf.send_buffer + offset;
remote_addresses[i] = perf.ucp.remote_addr + offset;
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
offset += lengths[i];
indices[i] = i;
local_offsets[i] = 0;
remote_offsets[i] = 0;
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
perf.params.msg_size_list[i];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next pr: align

offset += lengths[i];
}

device_clone(&m_params.indices, indices.data(), count);
device_clone(&m_params.addresses, addresses.data(), count);
device_clone(&m_params.remote_addresses, remote_addresses.data(), count);
device_clone(&m_params.local_offsets, local_offsets.data(), count);
device_clone(&m_params.remote_offsets, remote_offsets.data(), count);
device_clone(&m_params.lengths, lengths.data(), count);
}

void init_counters(const ucx_perf_context_t &perf)
{
m_params.length = ucx_perf_get_message_size(&perf.params);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next pr: align

m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn(
(void*)perf.ucp.remote_addr,
m_params.length);
m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer,
m_params.length);
m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer,
Expand Down Expand Up @@ -194,29 +199,21 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
case UCX_PERF_CMD_PUT_SINGLE:
/* TODO: Change to ucp_device_counter_write */
*params.counter_send = idx + 1;
return ucp_device_put_single<level>(params.mem_list, params.indices[0],
params.addresses[0],
params.remote_addresses[0],
params.length + ONESIDED_SIGNAL_SIZE,
return ucp_device_put_single<level>(params.mem_list, 0,
params.indices[0], 0, 0,
params.length +
ONESIDED_SIGNAL_SIZE,
params.flags, &req);
case UCX_PERF_CMD_PUT_MULTI:
return ucp_device_put_multi<level>(params.mem_list, params.addresses,
params.remote_addresses,
params.lengths, 1,
params.counter_remote, params.flags,
return ucp_device_put_multi<level>(params.mem_list, 0, 1, params.flags,
&req);
case UCX_PERF_CMD_PUT_PARTIAL:{
case UCX_PERF_CMD_PUT_PARTIAL: {
unsigned counter_index = params.mem_list->mem_list_length - 1;
return ucp_device_put_multi_partial<level>(params.mem_list,
params.indices,
counter_index,
params.addresses,
params.remote_addresses,
params.lengths,
counter_index, 1,
params.counter_remote,
params.flags, &req);
}
return ucp_device_put_multi_partial<level>(
params.mem_list, 0, params.indices, counter_index,
params.local_offsets, params.remote_offsets, params.lengths,
counter_index, 1, 0, params.flags, &req);
}
}

return UCS_ERR_INVALID_PARAM;
Expand All @@ -233,7 +230,7 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
}

do {
status = ucp_device_progress_req<level>(&req);
status = ucp_device_progress_req<level>(&req, 0);
} while (status == UCS_INPROGRESS);

return status;
Expand Down
Loading
Loading