Skip to content

Commit b1215a8

Browse files
Merge branch 'openucx:master' into patch-1
2 parents a5d439a + 0c6268a commit b1215a8

File tree

17 files changed

+476
-141
lines changed

17 files changed

+476
-141
lines changed

contrib/test_jenkins.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -657,10 +657,18 @@ run_ucx_perftest_cuda_device() {
657657
ucx_perftest="$ucx_inst/bin/ucx_perftest"
658658
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
659659

660-
# TODO: Run on all GPUs
660+
# TODO: Run on all GPUs & NICs combinations
661661
ucp_client_args="-a cuda:0 $(hostname)"
662+
gda_tls="cuda_copy,rc,rc_gda"
663+
cuda_ipc_tls="cuda_copy,rc,cuda_ipc"
662664

663-
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
665+
# TODO: Run with cuda_ipc_tls
666+
for tls in "$gda_tls"
667+
do
668+
export UCX_TLS=${tls}
669+
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
670+
done
671+
unset UCX_TLS
664672
}
665673

666674
#

contrib/ucx_perftest_config/test_types_ucp_device_cuda

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# UCP basic device cuda tests
33
#
44
ucp_device_cuda_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 1024 -n 10000
5-
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 128
5+
# TODO - Increase number of threads after adjusting perftest.
6+
ucp_device_cuda_bw_1k_128threads -t ucp_put_multi_bw -m cuda -s 1024 -n 10000 -T 32
67
ucp_device_cuda_lat_1k_1thread -t ucp_put_multi_lat -m cuda -s 1024 -n 10000
7-
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 128
8+
# TODO - Increase number of threads after adjusting perftest.
9+
ucp_device_cuda_lat_1k_128threads -t ucp_put_multi_lat -m cuda -s 1024 -n 10000 -T 32

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,11 @@ UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
7979
return count;
8080
}
8181

82-
UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
82+
UCS_F_DEVICE size_t ucx_bitset_ffns(const uint8_t *set, size_t bits,
83+
size_t from)
84+
{
8385
for (size_t i = from; i < bits; i++) {
84-
if (UCX_BIT_GET(set, i)) {
86+
if (!UCX_BIT_GET(set, i)) {
8587
return i;
8688
}
8789
}

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818

1919
class ucp_perf_cuda_request_manager {
2020
public:
21-
__device__ ucp_perf_cuda_request_manager(size_t size) : m_size(size)
21+
__device__
22+
ucp_perf_cuda_request_manager(size_t size, ucp_device_request *requests) :
23+
m_size(size), m_requests(requests)
2224
{
2325
assert(m_size <= CAPACITY);
2426
for (size_t i = 0; i < m_size; ++i) {
@@ -54,7 +56,7 @@ public:
5456
__device__ ucp_device_request_t &get_request()
5557
{
5658
assert(get_pending_count() < m_size);
57-
size_t index = ucx_bitset_ffs(m_pending, m_size, 0);
59+
size_t index = ucx_bitset_ffns(m_pending, m_size, 0);
5860
UCX_BIT_SET(m_pending, index);
5961
return m_requests[index];
6062
}
@@ -69,7 +71,7 @@ private:
6971
static const size_t CAPACITY = 128;
7072

7173
size_t m_size;
72-
ucp_device_request_t m_requests[CAPACITY];
74+
ucp_device_request_t *m_requests;
7375
uint8_t m_pending[UCX_BITSET_SIZE(CAPACITY)];
7476
};
7577

@@ -81,24 +83,29 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
8183
const void *address, uint64_t remote_address,
8284
size_t length)
8385
{
86+
extern __shared__ ucp_device_request requests[];
8487
ucx_perf_cuda_time_t last_report_time = ucx_perf_cuda_get_time_ns();
8588
ucx_perf_counter_t max_iters = ctx.max_iters;
8689
uint64_t *sn = ucx_perf_cuda_get_sn(address, length);
87-
ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding);
90+
ucp_device_request *thread_requests =
91+
&requests[ctx.max_outstanding * threadIdx.x];
92+
ucp_perf_cuda_request_manager request_mgr(ctx.max_outstanding,
93+
thread_requests);
8894
ucs_status_t status;
8995

9096
for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
9197
while (request_mgr.get_pending_count() >= ctx.max_outstanding) {
9298
status = request_mgr.progress<level>(1);
93-
if (status != UCS_OK) {
99+
if (UCS_STATUS_IS_ERR(status)) {
94100
break;
95101
}
96102
}
97103

98104
*sn = idx + 1;
99105
ucp_device_request_t &req = request_mgr.get_request();
100106
status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
101-
remote_address, length, 0, &req);
107+
remote_address, length,
108+
UCP_DEVICE_FLAG_NODELAY, &req);
102109
if (status != UCS_OK) {
103110
break;
104111
}
@@ -109,7 +116,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,
109116

110117
while (request_mgr.get_pending_count() > 0) {
111118
status = request_mgr.progress<level>(max_iters);
112-
if (status != UCS_OK) {
119+
if (UCS_STATUS_IS_ERR(status)) {
113120
break;
114121
}
115122
}
@@ -135,17 +142,19 @@ ucp_perf_cuda_put_single(ucp_device_mem_list_handle_h mem_list,
135142
unsigned mem_list_index, const void *address,
136143
uint64_t remote_address, size_t length)
137144
{
138-
ucp_device_request_t req;
145+
extern __shared__ ucp_device_request requests[];
146+
ucp_device_request *req = &requests[threadIdx.x];
139147
ucs_status_t status;
140148

141149
status = ucp_device_put_single<level>(mem_list, mem_list_index, address,
142-
remote_address, length, 0, &req);
150+
remote_address, length,
151+
UCP_DEVICE_FLAG_NODELAY, req);
143152
if (status != UCS_OK) {
144153
return status;
145154
}
146155

147156
do {
148-
status = ucp_device_progress_req<level>(&req);
157+
status = ucp_device_progress_req<level>(req);
149158
} while (status == UCS_INPROGRESS);
150159

151160
return status;
@@ -220,8 +229,9 @@ public:
220229
ucp_perf_barrier(&m_perf);
221230
ucx_perf_test_start_clock(&m_perf);
222231

223-
ucp_perf_cuda_put_multi_latency_kernel
224-
<UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
232+
ucp_perf_cuda_put_multi_latency_kernel<UCS_DEVICE_LEVEL_THREAD>
233+
<<<1, thread_count,
234+
thread_count * sizeof(ucp_device_request)>>>(
225235
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
226236
m_perf.ucp.remote_addr, length, m_perf.recv_buffer, my_index);
227237
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
@@ -250,10 +260,12 @@ public:
250260
}
251261

252262
unsigned thread_count = m_perf.params.device_thread_count;
253-
ucp_perf_cuda_put_multi_bw_kernel
254-
<UCS_DEVICE_LEVEL_THREAD><<<1, thread_count>>>(
255-
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
256-
m_perf.ucp.remote_addr, length);
263+
ucp_perf_cuda_put_multi_bw_kernel<UCS_DEVICE_LEVEL_THREAD>
264+
<<<1, thread_count,
265+
thread_count * m_perf.params.max_outstanding *
266+
sizeof(ucp_device_request)>>>(
267+
gpu_ctx(), handle.get(), 0, m_perf.send_buffer,
268+
m_perf.ucp.remote_addr, length);
257269
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetLastError);
258270
wait_for_kernel(length);
259271
} else if (my_index == 0) {

src/ucp/api/device/ucp_device_impl.h

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,23 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep,
6060
}
6161

6262

63+
/**
64+
* Macro for device put operations with retry logic
65+
*/
66+
#define UCP_DEVICE_PUT_BLOCKING(_level, _uct_device_ep_put, _device_ep, ...) \
67+
({ \
68+
ucs_status_t _status; \
69+
do { \
70+
_status = _uct_device_ep_put<_level>(_device_ep, __VA_ARGS__); \
71+
if (_status != UCS_ERR_NO_RESOURCE) { \
72+
break; \
73+
} \
74+
_status = uct_device_ep_progress<_level>(_device_ep); \
75+
} while (!UCS_STATUS_IS_ERR(_status)); \
76+
_status; \
77+
})
78+
79+
6380
UCS_F_DEVICE ucs_status_t ucp_device_prepare_single(
6481
ucp_device_mem_list_handle_h mem_list_h, unsigned mem_list_index,
6582
ucp_device_request_t *req, uct_device_ep_t *&device_ep,
@@ -115,6 +132,7 @@ ucp_device_prepare_multi(ucp_device_mem_list_handle_h mem_list_h,
115132
*
116133
* The routine returns a request that can be progressed and checked for
117134
* completion with @ref ucp_device_progress_req.
135+
* The routine returns only after the message has been posted or an error has occurred.
118136
*
119137
* This routine can be called repeatedly with the same handle and different
120138
* addresses and length. The flags parameter can be used to modify the behavior
@@ -149,8 +167,9 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single(
149167
return status;
150168
}
151169

152-
return uct_device_ep_put_single<level>(device_ep, uct_elem, address,
153-
remote_address, length, flags, comp);
170+
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_single, device_ep,
171+
uct_elem, address, remote_address, length,
172+
flags, comp);
154173
}
155174

156175

@@ -225,6 +244,7 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
225244
*
226245
* The routine returns a request that can be progressed and checked for
227246
* completion with @ref ucp_device_progress_req.
247+
* The routine returns only after all the messages have been posted or an error has occurred.
228248
*
229249
* This routine can be called repeatedly with the same handle and different
230250
* @a addresses, @a lengths and counter related parameters. The @a flags
@@ -261,11 +281,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
261281
return status;
262282
}
263283

264-
return uct_device_ep_put_multi<level>(device_ep, uct_mem_list,
265-
mem_list_h->mem_list_length,
266-
addresses, remote_addresses, lengths,
267-
counter_inc_value,
268-
counter_remote_address, flags, comp);
284+
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_multi, device_ep,
285+
uct_mem_list, mem_list_h->mem_list_length,
286+
addresses, remote_addresses, lengths,
287+
counter_inc_value, counter_remote_address,
288+
flags, comp);
269289
}
270290

271291

@@ -292,6 +312,7 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
292312
*
293313
* The routine returns a request that can be progressed and checked for
294314
* completion with @ref ucp_device_progress_req.
315+
* The routine returns only after all the messages have been posted or an error has occurred.
295316
*
296317
* This routine can be called repeatedly with the same handle and different
297318
* mem_list_indices, addresses, lengths and increment related parameters. The
@@ -334,10 +355,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial(
334355
return status;
335356
}
336357

337-
return uct_device_ep_put_multi_partial<level>(
338-
device_ep, uct_mem_list, mem_list_indices, mem_list_count,
339-
addresses, remote_addresses, lengths, counter_index,
340-
counter_inc_value, counter_remote_address, flags, comp);
358+
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_multi_partial,
359+
device_ep, uct_mem_list, mem_list_indices,
360+
mem_list_count, addresses, remote_addresses,
361+
lengths, counter_index, counter_inc_value,
362+
counter_remote_address, flags, comp);
341363
}
342364

343365

@@ -364,6 +386,28 @@ UCS_F_DEVICE uint64_t ucp_device_counter_read(const void *counter_ptr)
364386
}
365387

366388

389+
/**
390+
* @ingroup UCP_DEVICE
391+
* @brief Write value to the counter memory area.
392+
*
393+
* This function can be used to set counter to a specific value.
394+
*
395+
* The counter memory area must be initialized with the host function
396+
* @ref ucp_device_counter_init.
397+
*
398+
* @tparam level Level of cooperation of the transfer.
399+
* @param [in] counter_ptr Counter memory area.
400+
* @param [in] value Value to write.
401+
*
402+
*/
403+
template<ucs_device_level_t level = UCS_DEVICE_LEVEL_THREAD>
404+
UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value)
405+
{
406+
return ucs_device_atomic64_write(
407+
reinterpret_cast<uint64_t*>(counter_ptr), value);
408+
}
409+
410+
367411
/**
368412
* @ingroup UCP_DEVICE
369413
* @brief Progress a device request containing a batch of operations.
@@ -390,7 +434,12 @@ UCS_F_DEVICE ucs_status_t ucp_device_progress_req(ucp_device_request_t *req)
390434
}
391435

392436
status = uct_device_ep_progress<level>(req->device_ep);
393-
return (status != UCS_OK ? status : UCS_INPROGRESS);
437+
if (status != UCS_OK) {
438+
return status;
439+
}
440+
441+
return (ucs_likely(req->comp.count == 0)) ? req->comp.status :
442+
UCS_INPROGRESS;
394443
}
395444

396445
#endif /* UCP_DEVICE_IMPL_H */

src/ucp/core/ucp_device.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,8 @@ static ucs_status_t ucp_device_mem_list_create_handle(
271271
}
272272

273273
if (i == 0) {
274-
ucs_error("failed to select lane");
274+
ucs_error("failed to select lane for local device %s",
275+
ucs_topo_sys_device_get_name(local_sys_dev));
275276
return UCS_ERR_NO_RESOURCE;
276277
}
277278

src/ucp/core/ucp_ep.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ ucp_ep_peer_mem_get(ucp_context_h context, ucp_ep_h ep, uint64_t address,
321321

322322
data->size = size;
323323
ucp_ep_rkey_unpack_internal(ep, rkey_buf, 0, UCS_BIT(rkey_ptr_md_index), 0,
324-
UCS_SYS_DEVICE_ID_UNKNOWN, 1, &data->rkey);
324+
UCS_SYS_DEVICE_ID_UNKNOWN, &data->rkey);
325325
rkey_index = ucs_bitmap2idx(data->rkey->md_map, rkey_ptr_md_index);
326326
status = uct_rkey_ptr(data->rkey->tl_rkey[rkey_index].cmpt,
327327
&data->rkey->tl_rkey[rkey_index].rkey, address,

0 commit comments

Comments
 (0)