diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 5b7afaf911c..49f6aafb2d2 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -228,7 +228,7 @@ ucp_perf_cuda_send_sync(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, ucp_device_request_t &req) { ucs_status_t status = ucp_perf_cuda_send_nbx(params, idx, req); - if (status != UCS_OK) { + if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -262,7 +262,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx, ucp_device_request_t &req = request_mgr.get_request(); status = ucp_perf_cuda_send_nbx(params, idx, req); - if (status != UCS_OK) { + if (UCS_STATUS_IS_ERR(status)) { ucs_device_error("send failed: %d", status); goto out; } diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index 5c3ece9f71a..bc430d3b9fb 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -24,6 +24,7 @@ */ typedef struct ucp_device_request { uct_device_completion_t comp; + ucs_status_t status; uct_device_ep_h device_ep; } ucp_device_request_t; @@ -51,9 +52,6 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep, if (req != nullptr) { comp = &req->comp; req->device_ep = device_ep; - uct_device_completion_init(comp); - /* TODO: Handle multiple device posts with same req? */ - ++comp->count; } else { comp = nullptr; } @@ -63,7 +61,8 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep, /** * Macro for device put operations with retry logic */ -#define UCP_DEVICE_SEND_BLOCKING(_level, _uct_device_ep_send, _device_ep, ...) \ +#define UCP_DEVICE_SEND_BLOCKING(_level, _uct_device_ep_send, _device_ep, \ + _req, ...) \ ({ \ ucs_status_t _status; \ do { \ @@ -71,8 +70,11 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep, if (_status != UCS_ERR_NO_RESOURCE) { \ break; \ } \ - _status = uct_device_ep_progress<_level>(_device_ep); \ - } while (!UCS_STATUS_IS_ERR(_status)); \ + uct_device_ep_progress<_level>(_device_ep); \ + } while (1); \ + if (_req != nullptr) { \ + _req->status = _status; \ + } \ _status; \ }) @@ -148,8 +150,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( } return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_single, device_ep, - uct_elem, address, remote_address, length, - flags, comp); + req, uct_elem, address, remote_address, + length, flags, comp); } @@ -199,8 +201,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( } return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_atomic_add, device_ep, - uct_elem, inc_value, remote_address, flags, - comp); + req, uct_elem, inc_value, remote_address, + flags, comp); } @@ -263,8 +265,9 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( } return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi, device_ep, - uct_mem_list, mem_list_h->mem_list_length, - addresses, remote_addresses, lengths, + req, uct_mem_list, + mem_list_h->mem_list_length, addresses, + remote_addresses, lengths, counter_inc_value, counter_remote_address, flags, comp); } @@ -338,10 +341,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial( } return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi_partial, - device_ep, uct_mem_list, mem_list_indices, - mem_list_count, addresses, remote_addresses, - lengths, counter_index, counter_inc_value, - counter_remote_address, flags, comp); + device_ep, req, uct_mem_list, + mem_list_indices, mem_list_count, addresses, + remote_addresses, lengths, counter_index, + counter_inc_value, counter_remote_address, + flags, comp); } @@ -409,19 +413,14 @@ UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value) template UCS_F_DEVICE ucs_status_t ucp_device_progress_req(ucp_device_request_t *req) { - ucs_status_t status; - - if (ucs_likely(req->comp.count == 0)) { - return req->comp.status; - } - - status = uct_device_ep_progress(req->device_ep); - if (status != UCS_OK) { - return status; + if (ucs_likely(req->status != UCS_INPROGRESS)) { + return req->status; } - return (ucs_likely(req->comp.count == 0)) ? req->comp.status : - UCS_INPROGRESS; + uct_device_ep_progress(req->device_ep); + req->status = uct_device_ep_check_completion(req->device_ep, + &req->comp); + return req->status; } #endif /* UCP_DEVICE_IMPL_H */ diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index fa0d3e0f656..4f014a03045 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -15,6 +15,11 @@ #include +union uct_device_completion { + uct_rc_gda_completion_t rc_gda; + uct_cuda_ipc_completion_t cuda_ipc; +}; + /** * @ingroup UCT_DEVICE @@ -242,34 +247,37 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial( * @brief Progress all operations on device endpoint @a device_ep. * * @param [in] device_ep Device endpoint to be used for the operation. - * - * @return UCS_OK - Some operation was completed. - * @return UCS_INPROGRESS - No progress on the endpoint. - * @return Error code as defined by @ref ucs_status_t */ template -UCS_F_DEVICE ucs_status_t uct_device_ep_progress(uct_device_ep_h device_ep) +UCS_F_DEVICE void uct_device_ep_progress(uct_device_ep_h device_ep) { if (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA) { - return uct_rc_mlx5_gda_ep_progress(device_ep); - } else if (device_ep->uct_tl_id == UCT_DEVICE_TL_CUDA_IPC) { - return UCS_OK; + uct_rc_mlx5_gda_ep_progress(device_ep); } - - return UCS_ERR_UNSUPPORTED; } /** * @ingroup UCT_DEVICE - * @brief Initialize a device completion object. + * @brief Check whether opetation executed on device endpoint @a device_ep was + * completed. + * + * @param [in] device_ep Device endpoint to be used for the operation. + * @param [in] comp Completion object tracking operation progress. * - * @param [out] comp Device completion object to initialize. + * @return UCS_OK - Some operation was completed. + * @return UCS_INPROGRESS - No progress on the endpoint. + * @return Error code as defined by @ref ucs_status_t */ -UCS_F_DEVICE void uct_device_completion_init(uct_device_completion_t *comp) +template +UCS_F_DEVICE ucs_status_t uct_device_ep_check_completion( + uct_device_ep_h device_ep, uct_device_completion_t *comp) { - comp->count = 0; - comp->status = UCS_OK; + if (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA) { + return uct_rc_mlx5_gda_ep_check_completion(device_ep, comp); + } + + return UCS_ERR_UNSUPPORTED; } #endif diff --git a/src/uct/api/device/uct_device_types.h b/src/uct/api/device/uct_device_types.h index eec4af75001..d99701f06b1 100644 --- a/src/uct/api/device/uct_device_types.h +++ b/src/uct/api/device/uct_device_types.h @@ -42,10 +42,7 @@ typedef struct uct_device_ep { /* Completion object for device operations */ -typedef struct uct_device_completion { - uint32_t count; /* How many operations are pending */ - ucs_status_t status; /* Status of the operation */ -} uct_device_completion_t; +typedef union uct_device_completion uct_device_completion_t; /* Base structure for all device memory elements */ diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh index 2ce91b7bdc3..f3a9499e9cf 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh +++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh @@ -304,8 +304,6 @@ uct_cuda_ipc_ep_put_single(uct_device_ep_h device_ep, mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, remote_address); uct_cuda_ipc_copy_level(mapped_rem_addr, address, length); uct_cuda_ipc_level_sync(); - --comp->count; - return UCS_OK; } @@ -339,10 +337,6 @@ uct_cuda_ipc_ep_put_multi(uct_device_ep_h device_ep, } uct_cuda_ipc_level_sync(); - if (lane_id == 0) { - --comp->count; - } - return UCS_OK; } @@ -376,10 +370,6 @@ uct_cuda_ipc_ep_put_multi_partial(uct_device_ep_h device_ep, } uct_cuda_ipc_level_sync(); - if (lane_id == 0) { - --comp->count; - } - return UCS_OK; } @@ -403,10 +393,6 @@ uct_cuda_ipc_ep_atomic_add(uct_device_ep_h device_ep, } uct_cuda_ipc_level_sync(); - if (lane_id == 0) { - --comp->count; - } - return UCS_OK; } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_device.h b/src/uct/cuda/cuda_ipc/cuda_ipc_device.h index 3cba3f7fb3a..6a9f1f74e35 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_device.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_device.h @@ -12,4 +12,8 @@ typedef struct { ptrdiff_t mapped_offset; } uct_cuda_ipc_device_mem_element_t; + +typedef struct { +} uct_cuda_ipc_completion_t; + #endif diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c index 5fcd72c9012..e29a04cb486 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.c +++ b/src/uct/ib/mlx5/gdaki/gdaki.c @@ -92,10 +92,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params) uct_ib_mlx5_wq_calc_sizes(&qp_attr); cq_attr.flags |= UCT_IB_MLX5_CQ_IGNORE_OVERRUN; - cq_attr.umem_offset = ucs_align_up_pow2( - sizeof(uct_rc_gdaki_dev_ep_t) + - qp_attr.max_tx * sizeof(uct_rc_gdaki_op_t), - ucs_get_page_size()); + cq_attr.umem_offset = ucs_align_up_pow2(sizeof(uct_rc_gdaki_dev_ep_t), + ucs_get_page_size()); qp_attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB; qp_attr.super.srq_num = 0; @@ -109,9 +107,9 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params) dev_ep_size = qp_attr.umem_offset + qp_attr.len; /* * dev_ep layout: - * +---------------------+-------+---------+---------+ - * | counters, dbr | ops | cq buff | wq buff | - * +---------------------+-------+---------+---------+ + * +---------------------+---------+---------+ + * | counters, dbr | cq buff | wq buff | + * +---------------------+---------+---------+ */ status = uct_rc_gdaki_alloc(dev_ep_size, ucs_get_page_size(), (void**)&self->ep_gpu, &self->ep_raw); diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh index 3c6f43b7241..09357f22081 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.cuh +++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh @@ -13,6 +13,8 @@ #include #define UCT_RC_GDA_RESV_WQE_NO_RESOURCE -1ULL +#define UCT_RC_GDA_WQE_ERR UCS_BIT(63) +#define UCT_RC_GDA_WQE_MASK UCS_MASK(63) UCS_F_DEVICE void * @@ -21,8 +23,7 @@ uct_rc_mlx5_gda_get_wqe_ptr(uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx) const uint16_t nwqes_mask = __ldg(&ep->sq_wqe_num) - 1; const uintptr_t wqe_addr = __ldg((uintptr_t*)&ep->sq_wqe_daddr); const uint16_t idx = wqe_idx & nwqes_mask; - return (struct doca_gpu_dev_verbs_wqe - *)(wqe_addr + (idx << DOCA_GPUNETIO_MLX5_WQE_SQ_SHIFT)); + return (void*)(wqe_addr + (idx << DOCA_GPUNETIO_MLX5_WQE_SQ_SHIFT)); } UCS_F_DEVICE void @@ -233,30 +234,35 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single( uct_rc_gdaki_dev_ep_t *ep, const uct_device_mem_element_t *tl_mem_elem, const void *address, uint32_t lkey, uint64_t remote_address, uint32_t rkey, size_t length, uint64_t flags, - uct_device_completion_t *comp, uint32_t opcode, bool is_atomic, + uct_device_completion_t *tl_comp, uint32_t opcode, bool is_atomic, uint64_t add) { + uct_rc_gda_completion_t *comp = &tl_comp->rc_gda; unsigned cflag = 0; + uint64_t wqe_base; uint64_t wqe_idx; unsigned lane_id; unsigned num_lanes; uint32_t fc; uct_rc_mlx5_gda_exec_init(lane_id, num_lanes); - uct_rc_mlx5_gda_reserv_wqe(ep, 1, lane_id, wqe_idx); - if (wqe_idx == UCT_RC_GDA_RESV_WQE_NO_RESOURCE) { + uct_rc_mlx5_gda_reserv_wqe(ep, 1, lane_id, wqe_base); + if (wqe_base == UCT_RC_GDA_RESV_WQE_NO_RESOURCE) { return UCS_ERR_NO_RESOURCE; } fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2); + wqe_idx = wqe_base & 0xffff; if (lane_id == 0) { if ((comp != nullptr) || (wqe_idx == fc)) { cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE; - ep->ops[wqe_idx & (ep->sq_wqe_num - 1)].comp = comp; + if (comp != nullptr) { + comp->wqe_idx = wqe_base; + } } uct_rc_mlx5_gda_wqe_prepare_put_or_atomic( - ep, uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx), wqe_idx & 0xffff, + ep, uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx), wqe_idx, opcode, cflag, remote_address, rkey, reinterpret_cast(address), lkey, length, is_atomic, add); @@ -265,11 +271,11 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single( uct_rc_mlx5_gda_sync(); if (lane_id == 0) { - uct_rc_mlx5_gda_db(ep, wqe_idx, 1, flags); + uct_rc_mlx5_gda_db(ep, wqe_base, 1, flags); } uct_rc_mlx5_gda_sync(); - return UCS_OK; + return UCS_INPROGRESS; } template @@ -311,11 +317,13 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( unsigned mem_list_count, void *const *addresses, const uint64_t *remote_addresses, const size_t *lengths, uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, uct_device_completion_t *comp) + uint64_t flags, uct_device_completion_t *tl_comp) { auto ep = reinterpret_cast(tl_ep); auto mem_list = reinterpret_cast( tl_mem_list); + uct_rc_gda_completion_t *comp = &tl_comp->rc_gda; + int count = mem_list_count; int counter_index = count - 1; bool atomic = false; @@ -371,7 +379,9 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( if (((comp != nullptr) && (i == count - 1)) || ((comp == nullptr) && (wqe_idx == fc))) { cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE; - ep->ops[wqe_idx & (ep->sq_wqe_num - 1)].comp = comp; + if (comp != nullptr) { + comp->wqe_idx = wqe_base; + } } auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); @@ -391,7 +401,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( } uct_rc_mlx5_gda_sync(); - return UCS_OK; + return UCS_INPROGRESS; } template @@ -401,11 +411,12 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( void *const *addresses, const uint64_t *remote_addresses, const size_t *lengths, unsigned counter_index, uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, uct_device_completion_t *comp) + uint64_t flags, uct_device_completion_t *tl_comp) { auto ep = reinterpret_cast(tl_ep); auto mem_list = reinterpret_cast( tl_mem_list); + uct_rc_gda_completion_t *comp = &tl_comp->rc_gda; unsigned count = mem_list_count; bool atomic = false; uint64_t wqe_idx; @@ -463,7 +474,9 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( if (((comp != nullptr) && (i == count - 1)) || ((comp == nullptr) && (wqe_idx == fc))) { cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE; - ep->ops[wqe_idx & (ep->sq_wqe_num - 1)].comp = comp; + if (comp != nullptr) { + comp->wqe_idx = wqe_base; + } } auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); @@ -483,7 +496,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( } uct_rc_mlx5_gda_sync(); - return UCS_OK; + return UCS_INPROGRESS; } UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x) @@ -517,8 +530,7 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len) } } -UCS_F_DEVICE ucs_status_t -uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) +UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) { void *cqe = ep->cqe_daddr; size_t cqe_num = ep->cqe_num; @@ -531,75 +543,73 @@ uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep) op_owner = READ_ONCE(cqe64->op_own); if ((op_owner & MLX5_CQE_OWNER_MASK) ^ !!(cqe_idx & cqe_num)) { - return UCS_INPROGRESS; + return; } cuda::atomic_ref ref(ep->cqe_ci); if (!ref.compare_exchange_strong(cqe_idx, cqe_idx + 1, cuda::std::memory_order_relaxed)) { - return UCS_OK; + return; } uint8_t opcode = op_owner >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT; uint16_t wqe_cnt = uct_rc_mlx5_gda_bswap16(cqe64->wqe_counter); uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1); - if (opcode == MLX5_CQE_REQ_ERR) { - auto err_cqe = reinterpret_cast(cqe64); - auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); - ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x " - "wqe_idx:0x%x qp:0x%x", - idx, err_cqe->syndrome, err_cqe->vendor_err_synd, - err_cqe->hw_err_synd, wqe_idx, - doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) & - 0xffffff); - uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64); - uct_rc_mlx5_gda_qedump("CQE", cqe64, 64); - return UCS_ERR_IO_ERROR; - } - - if (ep->ops[wqe_idx].comp != nullptr) { - ep->ops[wqe_idx].comp->count--; // TODO maybe atomic? - } - cuda::atomic_ref pi_ref(ep->sq_wqe_pi); uint64_t sq_wqe_pi = ep->sq_wqe_pi; - pi_ref.fetch_max(((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi + 1); + sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi + 1; - doca_gpu_dev_verbs_fence_release(); - return UCS_OK; + if (opcode == MLX5_CQE_REQ) { + pi_ref.fetch_max(sq_wqe_pi); + return; + } + + auto err_cqe = reinterpret_cast(cqe64); + auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx); + ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x " + "wqe_idx:0x%x qp:0x%x", + idx, err_cqe->syndrome, err_cqe->vendor_err_synd, + err_cqe->hw_err_synd, wqe_idx, + doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) & + 0xffffff); + uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64); + uct_rc_mlx5_gda_qedump("CQE", cqe64, 64); + pi_ref.fetch_max(sq_wqe_pi | UCT_RC_GDA_WQE_ERR); } template -UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep) +UCS_F_DEVICE void uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep) { uct_rc_gdaki_dev_ep_t *ep = (uct_rc_gdaki_dev_ep_t*)tl_ep; + unsigned num_lanes; + unsigned lane_id; - if (level == UCS_DEVICE_LEVEL_BLOCK) { - __shared__ ucs_status_t status; + uct_rc_mlx5_gda_exec_init(lane_id, num_lanes); + if (lane_id == 0) { + uct_rc_mlx5_gda_progress_thread(ep); + } - if (threadIdx.x == 0) { - status = uct_rc_mlx5_gda_progress_thread(ep); - } + uct_rc_mlx5_gda_sync(); +} - __syncthreads(); - return status; - } else if (level == UCS_DEVICE_LEVEL_WARP) { - unsigned lane_id = doca_gpu_dev_verbs_get_lane_id(); - ucs_status_t status; +template +UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion( + uct_device_ep_h tl_ep, uct_device_completion_t *tl_comp) +{ + uct_rc_gdaki_dev_ep_t *ep = reinterpret_cast(tl_ep); + uct_rc_gda_completion_t *comp = &tl_comp->rc_gda; + uint64_t sq_wqe_pi = ep->sq_wqe_pi; - if (lane_id == 0) { - status = uct_rc_mlx5_gda_progress_thread(ep); - } + if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) <= comp->wqe_idx) { + return UCS_INPROGRESS; + } - status = (ucs_status_t)__shfl_sync(0xffffffff, status, 0); - __syncwarp(); - return status; - } else if (level == UCS_DEVICE_LEVEL_THREAD) { - return uct_rc_mlx5_gda_progress_thread(ep); - } else { - return UCS_ERR_UNSUPPORTED; + if (sq_wqe_pi & UCT_RC_GDA_WQE_ERR) { + return UCS_ERR_IO_ERROR; } + + return UCS_OK; } #endif diff --git a/src/uct/ib/mlx5/gdaki/gdaki_dev.h b/src/uct/ib/mlx5/gdaki/gdaki_dev.h index 182d8bdca64..d4a3a710c47 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki_dev.h +++ b/src/uct/ib/mlx5/gdaki/gdaki_dev.h @@ -8,10 +8,6 @@ #include -typedef struct { - uct_device_completion_t *comp; -} uct_rc_gdaki_op_t; - typedef struct { uct_device_ep_t super; @@ -34,8 +30,6 @@ typedef struct { uint32_t cqe_num; uint16_t sq_wqe_num; uint32_t sq_num; - - uct_rc_gdaki_op_t ops[0]; } uct_rc_gdaki_dev_ep_t; @@ -44,4 +38,8 @@ typedef struct uct_rc_gdaki_device_mem_element { uint32_t rkey; } uct_rc_gdaki_device_mem_element_t; +typedef struct { + uint64_t wqe_idx; +} uct_rc_gda_completion_t; + #endif /* UCT_GDAKI_DEV_H */ diff --git a/test/gtest/ucp/cuda/test_kernels.cu b/test/gtest/ucp/cuda/test_kernels.cu index 55b18a56d6f..6d5041c9848 100644 --- a/test/gtest/ucp/cuda/test_kernels.cu +++ b/test/gtest/ucp/cuda/test_kernels.cu @@ -64,11 +64,14 @@ ucp_test_kernel_do_operation(const test_ucp_device_kernel_params_t ¶ms, return UCS_OK; } - if ((status != UCS_OK) || !(flags & UCT_DEVICE_FLAG_NODELAY) || - (req_ptr == nullptr)) { + if (UCS_STATUS_IS_ERR(status)) { return status; } + if (!(flags & UCT_DEVICE_FLAG_NODELAY) || (req_ptr == nullptr)) { + return UCS_OK; + } + do { status = ucp_device_progress_req(req_ptr); } while (status == UCS_INPROGRESS); diff --git a/test/gtest/uct/cuda/test_kernels.cu b/test/gtest/uct/cuda/test_kernels.cu index 468864cf1de..b44ee8efa52 100644 --- a/test/gtest/uct/cuda/test_kernels.cu +++ b/test/gtest/uct/cuda/test_kernels.cu @@ -19,19 +19,18 @@ uct_put_single_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_elem, { uct_device_completion_t comp; - comp.count = 1; - comp.status = UCS_OK; ucs_status_t status = uct_device_ep_put_single( ep, mem_elem, va, rva, length, UCT_DEVICE_FLAG_NODELAY, &comp); - if (status != UCS_OK) { + if (status != UCS_INPROGRESS) { *status_p = status; return; } - while (comp.count != 0) { + while ((status = uct_device_ep_check_completion( + ep, &comp)) == UCS_INPROGRESS) { uct_device_ep_progress(ep); } - *status_p = UCS_OK; + *status_p = status; } /** @@ -55,19 +54,18 @@ uct_atomic_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_elem, { uct_device_completion_t comp; - comp.count = 1; - comp.status = UCS_OK; ucs_status_t status = uct_device_ep_atomic_add( ep, mem_elem, add, rva, UCT_DEVICE_FLAG_NODELAY, &comp); - if (status != UCS_OK) { + if (status != UCS_INPROGRESS) { *status_p = status; return; } - while (comp.count != 0) { + while ((status = uct_device_ep_check_completion( + ep, &comp)) == UCS_INPROGRESS) { uct_device_ep_progress(ep); } - *status_p = UCS_OK; + *status_p = status; } /** @@ -105,20 +103,19 @@ uct_put_multi_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_list, } __syncwarp(); - comp.count = 1; - comp.status = UCS_OK; - status = uct_device_ep_put_multi( + status = uct_device_ep_put_multi( ep, mem_list, iovcnt + 1, src, dst, sizes, 4, atomic_rva, UCT_DEVICE_FLAG_NODELAY, &comp); - if (status != UCS_OK) { + if (status != UCS_INPROGRESS) { *status_p = status; return; } - while (comp.count != 0) { + while ((status = uct_device_ep_check_completion( + ep, &comp)) == UCS_INPROGRESS) { uct_device_ep_progress(ep); } - *status_p = UCS_OK; + *status_p = status; } /** @@ -171,20 +168,19 @@ uct_put_partial_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_list, } __syncwarp(); - comp.count = 1; - comp.status = UCS_OK; - status = uct_device_ep_put_multi_partial( + status = uct_device_ep_put_multi_partial( ep, mem_list, indices, iovcnt, src, dst, sizes, iovcnt, 4, atomic_rva, UCT_DEVICE_FLAG_NODELAY, &comp); - if (status != UCS_OK) { + if (status != UCS_INPROGRESS) { *status_p = status; return; } - while (comp.count != 0) { + while ((status = uct_device_ep_check_completion( + ep, &comp)) == UCS_INPROGRESS) { uct_device_ep_progress(ep); } - *status_p = UCS_OK; + *status_p = status; } /** diff --git a/test/gtest/uct/cuda/test_kernels_uct.cu b/test/gtest/uct/cuda/test_kernels_uct.cu index 0b845c6ae22..0e009f29489 100644 --- a/test/gtest/uct/cuda/test_kernels_uct.cu +++ b/test/gtest/uct/cuda/test_kernels_uct.cu @@ -107,7 +107,6 @@ template class device_result_ptr { uct_device_completion_t comp; if (is_op_enabled(level)) { - uct_device_completion_init(&comp); *status = uct_device_ep_put_single(device_ep, mem_elem, address, remote_address, length, 0, &comp); @@ -176,7 +175,6 @@ uct_atomic_kernel(uct_device_ep_h ep, uct_device_completion_t comp; if (is_op_enabled(level)) { - uct_device_completion_init(&comp); *status_p = uct_device_ep_atomic_add(ep, mem_elem, add, rva, UCT_DEVICE_FLAG_NODELAY, &comp); } @@ -235,7 +233,6 @@ uct_put_multi_kernel(uct_device_ep_h ep, uct_device_completion_t comp; if (is_op_enabled(level)) { - uct_device_completion_init(&comp); *status_p = uct_device_ep_put_multi(ep, mem_list, mem_list_count, addresses, remote_addresses, lengths, counter_inc_value, counter_remote_address, @@ -301,7 +298,6 @@ uct_put_multi_partial_kernel(uct_device_ep_h ep, uct_device_completion_t comp; if (is_op_enabled(level)) { - uct_device_completion_init(&comp); *status_p = uct_device_ep_put_multi_partial(ep, mem_list, mem_list_indices, mem_list_count, addresses, remote_addresses, lengths, counter_index, counter_inc_value, counter_remote_address,