Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params &params, ucx_perf_counter_t idx,
ucp_device_request_t &req)
{
ucs_status_t status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
if (status != UCS_OK) {
if (UCS_STATUS_IS_ERR(status)) {
return status;
}

Expand Down Expand Up @@ -262,7 +262,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx,

ucp_device_request_t &req = request_mgr.get_request();
status = ucp_perf_cuda_send_nbx<level, cmd>(params, idx, req);
if (status != UCS_OK) {
if (UCS_STATUS_IS_ERR(status)) {
ucs_device_error("send failed: %d", status);
goto out;
}
Expand Down
53 changes: 26 additions & 27 deletions src/ucp/api/device/ucp_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*/
typedef struct ucp_device_request {
/* Transport completion object: its address is handed to the UCT send call
 * and later to uct_device_ep_check_completion() when the request is
 * progressed. */
uct_device_completion_t comp;
/* Last known status of the request: written by UCP_DEVICE_SEND_BLOCKING
 * with the result of the send, and refreshed by ucp_device_progress_req()
 * for as long as it equals UCS_INPROGRESS. */
ucs_status_t status;
/* Device endpoint the operation was posted on; stored by
 * ucp_device_request_init() and used to progress the endpoint. */
uct_device_ep_h device_ep;
} ucp_device_request_t;

Expand Down Expand Up @@ -51,9 +52,6 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep,
if (req != nullptr) {
comp = &req->comp;
req->device_ep = device_ep;
uct_device_completion_init(comp);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can also remove the TODO comment

/* TODO: Handle multiple device posts with same req? */
++comp->count;
} else {
comp = nullptr;
}
Expand All @@ -63,16 +61,20 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep,
/**
* Macro for device put operations with retry logic
*/
#define UCP_DEVICE_SEND_BLOCKING(_level, _uct_device_ep_send, _device_ep, ...) \
#define UCP_DEVICE_SEND_BLOCKING(_level, _uct_device_ep_send, _device_ep, \
_req, ...) \
({ \
ucs_status_t _status; \
do { \
_status = _uct_device_ep_send<_level>(_device_ep, __VA_ARGS__); \
if (_status != UCS_ERR_NO_RESOURCE) { \
break; \
} \
_status = uct_device_ep_progress<_level>(_device_ep); \
} while (!UCS_STATUS_IS_ERR(_status)); \
uct_device_ep_progress<_level>(_device_ep); \
} while (1); \
if (_req != nullptr) { \
_req->status = _status; \
} \
_status; \
})

Expand Down Expand Up @@ -148,8 +150,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single(
}

return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_single, device_ep,
uct_elem, address, remote_address, length,
flags, comp);
req, uct_elem, address, remote_address,
length, flags, comp);
}


Expand Down Expand Up @@ -199,8 +201,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
}

return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_atomic_add, device_ep,
uct_elem, inc_value, remote_address, flags,
comp);
req, uct_elem, inc_value, remote_address,
flags, comp);
}


Expand Down Expand Up @@ -263,8 +265,9 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
}

return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi, device_ep,
uct_mem_list, mem_list_h->mem_list_length,
addresses, remote_addresses, lengths,
req, uct_mem_list,
mem_list_h->mem_list_length, addresses,
remote_addresses, lengths,
counter_inc_value, counter_remote_address,
flags, comp);
}
Expand Down Expand Up @@ -338,10 +341,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial(
}

return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi_partial,
device_ep, uct_mem_list, mem_list_indices,
mem_list_count, addresses, remote_addresses,
lengths, counter_index, counter_inc_value,
counter_remote_address, flags, comp);
device_ep, req, uct_mem_list,
mem_list_indices, mem_list_count, addresses,
remote_addresses, lengths, counter_index,
counter_inc_value, counter_remote_address,
flags, comp);
}


Expand Down Expand Up @@ -409,19 +413,14 @@ UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value)
template<ucs_device_level_t level = UCS_DEVICE_LEVEL_THREAD>
UCS_F_DEVICE ucs_status_t ucp_device_progress_req(ucp_device_request_t *req)
{
ucs_status_t status;

if (ucs_likely(req->comp.count == 0)) {
return req->comp.status;
}

status = uct_device_ep_progress<level>(req->device_ep);
if (status != UCS_OK) {
return status;
if (ucs_likely(req->status != UCS_INPROGRESS)) {
return req->status;
}

return (ucs_likely(req->comp.count == 0)) ? req->comp.status :
UCS_INPROGRESS;
uct_device_ep_progress<level>(req->device_ep);
req->status = uct_device_ep_check_completion<level>(req->device_ep,
&req->comp);
return req->status;
}

#endif /* UCP_DEVICE_IMPL_H */
38 changes: 23 additions & 15 deletions src/uct/api/device/uct_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@

#include <uct/ib/mlx5/gdaki/gdaki.cuh>

/*
 * Storage behind uct_device_completion_t: one member per device transport,
 * all sharing the same memory.
 * NOTE(review): presumably the active member is selected by the endpoint's
 * uct_tl_id (UCT_DEVICE_TL_RC_MLX5_GDA / UCT_DEVICE_TL_CUDA_IPC) - confirm
 * against the transport implementations.
 */
union uct_device_completion {
/* Completion state for the RC MLX5 GDA transport */
uct_rc_gda_completion_t rc_gda;
/* Completion state for the CUDA IPC transport */
uct_cuda_ipc_completion_t cuda_ipc;
};


/**
* @ingroup UCT_DEVICE
Expand Down Expand Up @@ -242,34 +247,37 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial(
* @brief Progress all operations on device endpoint @a device_ep.
*
* @param [in] device_ep Device endpoint to be used for the operation.
*
* @return UCS_OK - Some operation was completed.
* @return UCS_INPROGRESS - No progress on the endpoint.
* @return Error code as defined by @ref ucs_status_t
*/
template<ucs_device_level_t level>
UCS_F_DEVICE ucs_status_t uct_device_ep_progress(uct_device_ep_h device_ep)
UCS_F_DEVICE void uct_device_ep_progress(uct_device_ep_h device_ep)
{
if (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA) {
return uct_rc_mlx5_gda_ep_progress<level>(device_ep);
} else if (device_ep->uct_tl_id == UCT_DEVICE_TL_CUDA_IPC) {
return UCS_OK;
uct_rc_mlx5_gda_ep_progress<level>(device_ep);
}

return UCS_ERR_UNSUPPORTED;
}


/**
* @ingroup UCT_DEVICE
* @brief Initialize a device completion object.
* @brief Check whether an operation executed on device endpoint @a device_ep
* was completed.
*
* @param [in] device_ep Device endpoint to be used for the operation.
* @param [in] comp Completion object tracking operation progress.
*
* @param [out] comp Device completion object to initialize.
* @return UCS_OK - The operation tracked by @a comp was completed.
* @return UCS_INPROGRESS - The operation tracked by @a comp is still in progress.
* @return Error code as defined by @ref ucs_status_t
*/
UCS_F_DEVICE void uct_device_completion_init(uct_device_completion_t *comp)
template<ucs_device_level_t level>
UCS_F_DEVICE ucs_status_t uct_device_ep_check_completion(
uct_device_ep_h device_ep, uct_device_completion_t *comp)
{
comp->count = 0;
comp->status = UCS_OK;
if (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA) {
return uct_rc_mlx5_gda_ep_check_completion<level>(device_ep, comp);
}

return UCS_ERR_UNSUPPORTED;
}

#endif
5 changes: 1 addition & 4 deletions src/uct/api/device/uct_device_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@ typedef struct uct_device_ep {


/* Completion object for device operations */
typedef struct uct_device_completion {
uint32_t count; /* How many operations are pending */
ucs_status_t status; /* Status of the operation */
} uct_device_completion_t;
typedef union uct_device_completion uct_device_completion_t;


/* Base structure for all device memory elements */
Expand Down
14 changes: 0 additions & 14 deletions src/uct/cuda/cuda_ipc/cuda_ipc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,6 @@ uct_cuda_ipc_ep_put_single(uct_device_ep_h device_ep,
mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, remote_address);
uct_cuda_ipc_copy_level<level>(mapped_rem_addr, address, length);
uct_cuda_ipc_level_sync<level>();
--comp->count;

return UCS_OK;
}

Expand Down Expand Up @@ -339,10 +337,6 @@ uct_cuda_ipc_ep_put_multi(uct_device_ep_h device_ep,
}

uct_cuda_ipc_level_sync<level>();
if (lane_id == 0) {
--comp->count;
}

return UCS_OK;
}

Expand Down Expand Up @@ -376,10 +370,6 @@ uct_cuda_ipc_ep_put_multi_partial(uct_device_ep_h device_ep,
}

uct_cuda_ipc_level_sync<level>();
if (lane_id == 0) {
--comp->count;
}

return UCS_OK;
}

Expand All @@ -403,10 +393,6 @@ uct_cuda_ipc_ep_atomic_add(uct_device_ep_h device_ep,
}

uct_cuda_ipc_level_sync<level>();
if (lane_id == 0) {
--comp->count;
}

return UCS_OK;
}

Expand Down
4 changes: 4 additions & 0 deletions src/uct/cuda/cuda_ipc/cuda_ipc_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ typedef struct {
ptrdiff_t mapped_offset;
} uct_cuda_ipc_device_mem_element_t;


/*
 * Completion object for the CUDA IPC device transport. It has no fields.
 * NOTE(review): presumably no per-operation state is needed because the
 * cuda_ipc device operations finish before returning UCS_OK - confirm.
 * NOTE(review): a struct without members is a compiler extension in C (and
 * has a different size in C and C++); verify this header is only consumed
 * where that is acceptable.
 */
typedef struct {
} uct_cuda_ipc_completion_t;

#endif
12 changes: 5 additions & 7 deletions src/uct/ib/mlx5/gdaki/gdaki.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
uct_ib_mlx5_wq_calc_sizes(&qp_attr);

cq_attr.flags |= UCT_IB_MLX5_CQ_IGNORE_OVERRUN;
cq_attr.umem_offset = ucs_align_up_pow2(
sizeof(uct_rc_gdaki_dev_ep_t) +
qp_attr.max_tx * sizeof(uct_rc_gdaki_op_t),
ucs_get_page_size());
cq_attr.umem_offset = ucs_align_up_pow2(sizeof(uct_rc_gdaki_dev_ep_t),
ucs_get_page_size());

qp_attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB;
qp_attr.super.srq_num = 0;
Expand All @@ -109,9 +107,9 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
dev_ep_size = qp_attr.umem_offset + qp_attr.len;
/*
* dev_ep layout:
* +---------------------+-------+---------+---------+
* | counters, dbr | ops | cq buff | wq buff |
* +---------------------+-------+---------+---------+
* +---------------------+---------+---------+
* | counters, dbr | cq buff | wq buff |
* +---------------------+---------+---------+
*/
status = uct_rc_gdaki_alloc(dev_ep_size, ucs_get_page_size(),
(void**)&self->ep_gpu, &self->ep_raw);
Expand Down
Loading
Loading