diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index 628d5358e8e..229b15fbff9 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -657,7 +657,6 @@ run_ucx_perftest_cuda_device() { ucx_perftest="$ucx_inst/bin/ucx_perftest" ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda" - # TODO: Run on all GPUs & NICs combinations # TODO: Run on all GPUs & NICs combinations ucp_client_args="-a cuda:0 $(hostname)" gda_tls="cuda_copy,rc,rc_gda" diff --git a/contrib/ucx_perftest_config/test_types_ucp_device_cuda b/contrib/ucx_perftest_config/test_types_ucp_device_cuda index 86e24d125ae..99a7a1cebac 100644 --- a/contrib/ucx_perftest_config/test_types_ucp_device_cuda +++ b/contrib/ucx_perftest_config/test_types_ucp_device_cuda @@ -11,9 +11,10 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2 # Increase number of threads after following fixes: # - Use thread-local memory instead of shared for requests (limit 48K) # - Fix WQE size limit of 1024 -ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 -ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 -ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 -ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 -ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 -ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 +# TODO - enable when wqe reserve is fixed. +# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 +# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 +# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 +# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 +# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2 +# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2 diff --git a/src/tools/perf/cuda/ucp_cuda_kernel.cu b/src/tools/perf/cuda/ucp_cuda_kernel.cu index 49f6aafb2d2..b7401bb3b53 100644 --- a/src/tools/perf/cuda/ucp_cuda_kernel.cu +++ b/src/tools/perf/cuda/ucp_cuda_kernel.cu @@ -80,10 +80,9 @@ struct ucp_perf_cuda_params { ucp_device_mem_list_handle_h mem_list; size_t length; unsigned *indices; - void **addresses; - uint64_t *remote_addresses; + size_t *local_offsets; + size_t *remote_offsets; size_t *lengths; - uint64_t counter_remote; uint64_t *counter_send; uint64_t *counter_recv; ucp_device_flags_t flags; @@ -102,8 +101,8 @@ public: { ucp_device_mem_list_release(m_params.mem_list); CUDA_CALL_WARN(cudaFree, m_params.indices); - CUDA_CALL_WARN(cudaFree, m_params.addresses); - CUDA_CALL_WARN(cudaFree, m_params.remote_addresses); + CUDA_CALL_WARN(cudaFree, m_params.local_offsets); + CUDA_CALL_WARN(cudaFree, m_params.remote_offsets); CUDA_CALL_WARN(cudaFree, m_params.lengths); } @@ -113,13 +112,23 @@ private: void init_mem_list(const ucx_perf_context_t &perf) { /* +1 for the counter */ - size_t count = perf.params.msg_size_cnt + 1; + size_t count = perf.params.msg_size_cnt + 1; + size_t offset = 0; ucp_device_mem_list_elem_t elems[count]; + for (size_t i = 0; i < count; ++i) { elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | - UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; + UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH; elems[i].memh = perf.ucp.send_memh; elems[i].rkey = perf.ucp.rkey; + elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset); + elems[i].remote_addr = perf.ucp.remote_addr + offset; + elems[i].length = (i == count - 1) ? ONESIDED_SIGNAL_SIZE : + perf.params.msg_size_list[i]; + offset += elems[i].length; } ucp_device_mem_list_params_t params; @@ -140,33 +149,30 @@ private: void init_elements(const ucx_perf_context_t &perf) { /* +1 for the counter */ - size_t count = perf.params.msg_size_cnt + 1; + size_t count = perf.params.msg_size_cnt + 1; + size_t offset = 0; std::vector indices(count); - std::vector addresses(count); - std::vector remote_addresses(count); + std::vector local_offsets(count, 0); + std::vector remote_offsets(count, 0); std::vector lengths(count); - for (unsigned i = 0, offset = 0; i < count; ++i) { - indices[i] = i; - addresses[i] = (char *)perf.send_buffer + offset; - remote_addresses[i] = perf.ucp.remote_addr + offset; - lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE : - perf.params.msg_size_list[i]; - offset += lengths[i]; + + for (unsigned i = 0; i < count; ++i) { + indices[i] = i; + lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE : + perf.params.msg_size_list[i]; + offset += lengths[i]; } device_clone(&m_params.indices, indices.data(), count); - device_clone(&m_params.addresses, addresses.data(), count); - device_clone(&m_params.remote_addresses, remote_addresses.data(), count); + device_clone(&m_params.local_offsets, local_offsets.data(), count); + device_clone(&m_params.remote_offsets, remote_offsets.data(), count); device_clone(&m_params.lengths, lengths.data(), count); } void init_counters(const ucx_perf_context_t &perf) { m_params.length = ucx_perf_get_message_size(&perf.params); - m_params.counter_remote = (uint64_t)ucx_perf_cuda_get_sn( - (void*)perf.ucp.remote_addr, - m_params.length); m_params.counter_send = ucx_perf_cuda_get_sn(perf.send_buffer, m_params.length); m_params.counter_recv = ucx_perf_cuda_get_sn(perf.recv_buffer, @@ -195,28 +201,20 @@ ucp_perf_cuda_send_nbx(ucp_perf_cuda_params ¶ms, ucx_perf_counter_t idx, /* TODO: Change to ucp_device_counter_write */ *params.counter_send = idx + 1; return ucp_device_put_single(params.mem_list, params.indices[0], - params.addresses[0], - params.remote_addresses[0], - params.length + ONESIDED_SIGNAL_SIZE, - params.flags, &req); + 0, 0, + params.length + + ONESIDED_SIGNAL_SIZE, + 0, params.flags, &req); case UCX_PERF_CMD_PUT_MULTI: - return ucp_device_put_multi(params.mem_list, params.addresses, - params.remote_addresses, - params.lengths, 1, - params.counter_remote, params.flags, + return ucp_device_put_multi(params.mem_list, 1, 0, params.flags, &req); - case UCX_PERF_CMD_PUT_PARTIAL:{ + case UCX_PERF_CMD_PUT_PARTIAL: { unsigned counter_index = params.mem_list->mem_list_length - 1; - return ucp_device_put_multi_partial(params.mem_list, - params.indices, - counter_index, - params.addresses, - params.remote_addresses, - params.lengths, - counter_index, 1, - params.counter_remote, - params.flags, &req); - } + return ucp_device_put_multi_partial( + params.mem_list, params.indices, counter_index, + params.local_offsets, params.remote_offsets, params.lengths, + counter_index, 1, 0, 0, params.flags, &req); + } } return UCS_ERR_INVALID_PARAM; diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index bc430d3b9fb..e656f474bc7 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -26,6 +26,7 @@ typedef struct ucp_device_request { uct_device_completion_t comp; ucs_status_t status; uct_device_ep_h device_ep; + unsigned channel_id; } ucp_device_request_t; @@ -95,8 +96,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send( device_ep = mem_list_h->uct_device_eps[lane]; elem_offset = first_mem_elem_index * mem_list_h->uct_mem_element_size[lane]; - uct_elem = (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_list_h + 1, - elem_offset); + uct_elem = (uct_device_mem_element_t*) + UCS_PTR_BYTE_OFFSET(mem_list_h->uct_mem_elements, elem_offset); ucp_device_request_init(device_ep, req, comp); return UCS_OK; @@ -109,24 +110,26 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send( * * This device routine posts one put operation using descriptor list handle. * The @a mem_list_index is used to point at the @a mem_list entry to be used - * for the memory transfer. The addresses and length must be valid for the used - * @a mem_list entry. + * for the memory transfer. The @a local_offset and @a remote_offset parameters + * specify byte offsets within the selected memory list entry. The @a length, + * @a local_offset and @a remote_offset parameters must be valid + * for the used @a mem_list entry. * * The routine returns a request that can be progressed and checked for * completion with @ref ucp_device_progress_req. * The routine returns only after the message has been posted or an error has occurred. * * This routine can be called repeatedly with the same handle and different - * addresses and length. The flags parameter can be used to modify the behavior + * offsets and length. The flags parameter can be used to modify the behavior * of the routine with bit from @ref ucp_device_flags_t. * * @tparam level Level of cooperation of the transfer. * @param [in] mem_list_h Memory descriptor list handle to use. * @param [in] mem_list_index Index in descriptor list pointing to the memory - * @param [in] address Local virtual address to send data from. - * @param [in] remote_address Remote virtual address to send data to. + * @param [in] local_offset Local offset to send data from. + * @param [in] remote_offset Remote offset to send data to. * @param [in] length Length in bytes of the data to send. - * registration keys to use for the transfer. + * @param [in] channel_id Channel ID to use for the transfer. * @param [in] flags Flags usable to modify the function behavior. * @param [out] req Request populated by the call. * @@ -135,9 +138,13 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send( template UCS_F_DEVICE ucs_status_t ucp_device_put_single( ucp_device_mem_list_handle_h mem_list_h, unsigned mem_list_index, - const void *address, uint64_t remote_address, size_t length, - uint64_t flags, ucp_device_request_t *req) + size_t local_offset, size_t remote_offset, size_t length, + unsigned channel_id, uint64_t flags, ucp_device_request_t *req) { + const void *address = UCS_PTR_BYTE_OFFSET( + mem_list_h->local_addrs[mem_list_index], local_offset); + const uint64_t remote_address = mem_list_h->remote_addrs[mem_list_index] + + remote_offset; const uct_device_mem_element_t *uct_elem; uct_device_completion_t *comp; uct_device_ep_t *device_ep; @@ -161,14 +168,14 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( * * This device routine posts one increment operation using memory descriptor * list handle. The @ref mem_list_index is used to point at the @a mem_list - * entry to be used for the increment operation. The remote address must be + * entry to be used for the increment operation. The remote offset must be * valid for the used @a mem_list entry. * * The routine returns a request that can be progressed and checked for * completion with @ref ucp_device_progress_req. * * This routine can be called repeatedly with the same handle and different - * address. The flags parameter can be used to modify the behavior of the + * counter offset. The flags parameter can be used to modify the behavior of the * routine. * * @tparam level Level of cooperation of the transfer. @@ -176,8 +183,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( * @param [in] mem_list_index Index in descriptor list pointing to the memory * remote key to use for the increment operation. * @param [in] inc_value Value used to increment the remote address. - * @param [in] remote_address Remote virtual address to perform the increment - * to. + * @param [in] remote_offset Remote offset to perform the increment to. + * @param [in] channel_id Channel ID to use for the transfer. * @param [in] flags Flags usable to modify the function behavior. * @param [out] req Request populated by the call. * @@ -186,9 +193,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( template UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( ucp_device_mem_list_handle_h mem_list_h, unsigned mem_list_index, - uint64_t inc_value, uint64_t remote_address, uint64_t flags, - ucp_device_request_t *req) + uint64_t inc_value, size_t remote_offset, unsigned channel_id, + uint64_t flags, ucp_device_request_t *req) { + uint64_t remote_address = mem_list_h->remote_addrs[mem_list_index] + + remote_offset; const uct_device_mem_element_t *uct_elem; uct_device_completion_t *comp; uct_device_ep_t *device_ep; @@ -211,36 +220,26 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( * @brief Posts multiple put operations followed by one increment operation. * * This device routine posts a batch of put operations using the descriptor list - * entries in the input handle, followed by an increment operation. This - * operation can be polled on the receiver to detect completion of all the + * entries in the input handle, followed by an increment operation if @a counter_inc_value != 0. + * This operation can be polled on the receiver to detect completion of all the * operations of the batch, started during the same routine call. * - * The content of each entries in the arrays @a addresses, @a remote_addresses - * and @a lengths must be valid for each corresponding entry in the descriptor - * list from the input handle. The last entry in the descriptor list contains + * The last entry in the descriptor list contains * the remote memory registration descriptors to be used for the increment * operation. * - * The size of the arrays @a addresses, @a remote_addresses, and @a lengths - * are all equal to the size of the descriptor list array from the handle, - * minus one. - * * The routine returns a request that can be progressed and checked for * completion with @ref ucp_device_progress_req. * The routine returns only after all the messages have been posted or an error has occurred. * - * This routine can be called repeatedly with the same handle and different - * @a addresses, @a lengths and counter related parameters. The @a flags + * This routine can be called repeatedly. The @a flags * parameter can be used to modify the behavior of the routine with bit from * @ref ucp_device_flags_t. * * @tparam level Level of cooperation of the transfer. * @param [in] mem_list_h Memory descriptor list handle to use. - * @param [in] addresses Array of local addresses to send from. - * @param [in] remote_addresses Array of remote addresses to send to. - * @param [in] lengths Array of lengths in bytes for each send. * @param [in] counter_inc_value Value of the remote increment. - * @param [in] counter_remote_address Remote address to increment to. + * @param [in] channel_id Channel ID to use for the transfer. * @param [in] flags Flags to modify the function behavior. * @param [out] req Request populated by the call. * @@ -248,11 +247,14 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( */ template UCS_F_DEVICE ucs_status_t ucp_device_put_multi( - ucp_device_mem_list_handle_h mem_list_h, void *const *addresses, - const uint64_t *remote_addresses, const size_t *lengths, - uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, ucp_device_request_t *req) + ucp_device_mem_list_handle_h mem_list_h, uint64_t counter_inc_value, + unsigned channel_id, uint64_t flags, ucp_device_request_t *req) { + void *const *addresses = mem_list_h->local_addrs; + const uint64_t *remote_addresses = mem_list_h->remote_addrs; + const size_t *lengths = mem_list_h->lengths; + uint64_t counter_remote_address = + mem_list_h->remote_addrs[mem_list_h->mem_list_length - 1]; const uct_device_mem_element_t *uct_mem_list; uct_device_completion_t *comp; uct_device_ep_t *device_ep; @@ -278,20 +280,20 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( * @brief Posts few put operations followed by one atomic increment operation. * * This device routine posts a batch of put operations using only some of the - * descriptor list entries in the input handle, followed by an operation. + * descriptor list entries in the input handle, followed by + * an increment operation if @a counter_inc_value != 0. * This increment operation can be polled on the receiver to detect completion * of all operations of the batch, started during the same routine call. * * The set of indices from the descriptor list entries to use are to be passed - * in the array @ref mem_list_indices. The last entry of the descriptor list is to - * be used for the final increment operation. + * in the array @ref mem_list_indices. * - * The content of each entries in the arrays addresses, remote_addresses and - * lengths must be valid for each corresponding descriptor list entry whose + * The content of each entries in the arrays @a local_offsets, @a remote_offsets + * and @a lengths must be valid for each corresponding descriptor list entry whose * index is referenced in @ref mem_list_indices. * - * The size of the arrays mem_list_indices, addresses, remote_addresses, and - * lengths are all equal. They are lower than the size of the descriptor list + * The size of the arrays @a mem_list_indices, @a local_offsets, @a remote_offsets, and + * @a lengths are all equal. They are lower than the size of the descriptor list * array from the handle. * * The routine returns a request that can be progressed and checked for @@ -299,8 +301,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( * The routine returns only after all the messages have been posted or an error has occurred. * * This routine can be called repeatedly with the same handle and different - * mem_list_indices, addresses, lengths and increment related parameters. The - * flags parameter can be used to modify the behavior of the routine with bit + * mem_list_indices, local_offsets, remote_offsets, lengths and increment related parameters. + * The @a flags parameter can be used to modify the behavior of the routine with bit * from @ref ucp_device_flags_t. * * @tparam level Level of cooperation of the transfer. @@ -309,12 +311,13 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( * list of entries from handle. * @param [in] mem_list_count Number of indices in the array @ref * mem_list_indices. - * @param [in] addresses Array of local addresses to send from. - * @param [in] remote_addresses Array of remote addresses to send to. + * @param [in] local_offsets Array of local offsets to send from. + * @param [in] remote_offsets Array of remote offsets to send to. * @param [in] lengths Array of lengths in bytes for each send. * @param [in] counter_index Index of remote increment descriptor. * @param [in] counter_inc_value Value of the remote increment. - * @param [in] counter_remote_address Remote address to increment to. + * @param [in] counter_remote_offset Remote offset to increment to. + * @param [in] channel_id Channel ID to use for the transfer. * @param [in] flags Flags to modify the function behavior. * @param [out] req Request populated by the call. * @@ -324,11 +327,15 @@ template UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial( ucp_device_mem_list_handle_h mem_list_h, const unsigned *mem_list_indices, unsigned mem_list_count, - void *const *addresses, const uint64_t *remote_addresses, + const size_t *local_offsets, const size_t *remote_offsets, const size_t *lengths, unsigned counter_index, - uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, ucp_device_request_t *req) + uint64_t counter_inc_value, size_t counter_remote_offset, + unsigned channel_id, uint64_t flags, ucp_device_request_t *req) { + void *const *addresses = mem_list_h->local_addrs; + const uint64_t *remote_addresses = mem_list_h->remote_addrs; + uint64_t counter_remote_address = mem_list_h->remote_addrs[counter_index] + + counter_remote_offset; const uct_device_mem_element_t *uct_mem_list; uct_device_completion_t *comp; uct_device_ep_t *device_ep; @@ -343,7 +350,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial( return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi_partial, device_ep, req, uct_mem_list, mem_list_indices, mem_list_count, addresses, - remote_addresses, lengths, counter_index, + remote_addresses, local_offsets, + remote_offsets, lengths, counter_index, counter_inc_value, counter_remote_address, flags, comp); } @@ -389,8 +397,7 @@ UCS_F_DEVICE uint64_t ucp_device_counter_read(const void *counter_ptr) template UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value) { - return ucs_device_atomic64_write( - reinterpret_cast(counter_ptr), value); + ucs_device_atomic64_write(reinterpret_cast(counter_ptr), value); } @@ -402,8 +409,8 @@ UCS_F_DEVICE void ucp_device_counter_write(void *counter_ptr, uint64_t value) * batch of one or many operations in progress. * * @tparam level Level of cooperation of the transfer. - * @param [in] req Request containing operations in progress. - * + * @param [in] req Request containing operations in progress and channel to progress. + * * @return UCS_OK - The request has completed, no more operations are * in progress. * @return UCS_INPROGRESS - One or more operations in the request batch diff --git a/src/ucp/api/device/ucp_device_types.h b/src/ucp/api/device/ucp_device_types.h index b62902d714b..ebf06846c71 100644 --- a/src/ucp/api/device/ucp_device_types.h +++ b/src/ucp/api/device/ucp_device_types.h @@ -61,6 +61,27 @@ typedef struct ucp_device_mem_list_handle { uint16_t uct_mem_element_size[UCP_DEVICE_MEM_LIST_MAX_EPS]; /** + * Array of local addresses for the device transfer operations. + */ + void **local_addrs; + + /** + * Array of remote addresses for the device transfer operations. + */ + uint64_t *remote_addrs; + + /** + * Array of lengths of the local buffers in bytes. + */ + size_t *lengths; + + /** + * Array of UCT memory element objects. + */ + void *uct_mem_elements; + + /** + * local address, remote address, and length arrays, are allocated contiguously. * For each @ref num_uct_eps UCT endpoints, a list of @ref * uct_device_mem_element objects. */ diff --git a/src/ucp/api/device/ucp_host.h b/src/ucp/api/device/ucp_host.h index 21f23ad590e..72bc0fc8d2f 100644 --- a/src/ucp/api/device/ucp_host.h +++ b/src/ucp/api/device/ucp_host.h @@ -37,7 +37,8 @@ enum ucp_device_mem_list_elem_field { UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH = UCS_BIT(0), /**< Source memory handle */ UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key */ UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR = UCS_BIT(2), /**< Local address */ - UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3) /**< Remote address */ + UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */ + UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH = UCS_BIT(4) /**< Length of the local buffer in bytes */ }; @@ -67,6 +68,11 @@ typedef struct ucp_device_mem_list_elem { */ void* local_addr; + /** + * Length of the local buffer in bytes. + */ + size_t length; + /** * Remote memory address for the device transfer operations. */ diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index dffba8622b5..919c547b138 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -122,6 +122,7 @@ ucp_device_mem_list_params_check(const ucp_device_mem_list_params_t *params, /* TODO: Delegate most of checks below to proto selection */ if ((rkey == NULL) || (memh == NULL)) { + ucs_error("element[%lu] rkey=%p, memh=%p", i, rkey, memh); return UCS_ERR_INVALID_PARAM; } @@ -249,6 +250,15 @@ static ucs_status_t ucp_device_mem_list_create_handle( const ucp_device_mem_list_elem_t *ucp_element; ucp_md_index_t local_md_index; uint8_t rkey_index; + void **local_addresses; + uint64_t *remote_addresses; + size_t *lengths; + size_t length; + void *local_addr; + uint64_t remote_addr; + + handle_size += sizeof(*handle.local_addrs) + sizeof(*handle.remote_addrs) + + sizeof(*handle.lengths); /* For each available lane */ for (i = 0; @@ -314,8 +324,37 @@ static ucs_status_t ucp_device_mem_list_create_handle( return status; } + /* populate elements common parameters */ + local_addresses = (void**)UCS_PTR_BYTE_OFFSET(mem->address, sizeof(handle)); + remote_addresses = (uint64_t*) + UCS_PTR_BYTE_OFFSET(local_addresses, sizeof(*handle.local_addrs) * + params->num_elements); + lengths = (size_t*)UCS_PTR_BYTE_OFFSET(remote_addresses, + sizeof(*handle.remote_addrs) * + params->num_elements); + for (i = 0; i < params->num_elements; i++) { + ucp_element = ¶ms->elements[i]; + local_addr = UCS_PARAM_VALUE(UCP_DEVICE_MEM_LIST_ELEM_FIELD, + ucp_element, local_addr, LOCAL_ADDR, NULL); + remote_addr = UCS_PARAM_VALUE(UCP_DEVICE_MEM_LIST_ELEM_FIELD, + ucp_element, remote_addr, REMOTE_ADDR, 0); + length = UCS_PARAM_VALUE(UCP_DEVICE_MEM_LIST_ELEM_FIELD, ucp_element, + length, LENGTH, 0); + ucp_mem_type_unpack(ep->worker, &local_addresses[i], &local_addr, + sizeof(local_addresses[i]), mem_type); + ucp_mem_type_unpack(ep->worker, &remote_addresses[i], &remote_addr, + sizeof(remote_addresses[i]), mem_type); + ucp_mem_type_unpack(ep->worker, &lengths[i], &length, + sizeof(lengths[i]), mem_type); + } + + handle.local_addrs = local_addresses; + handle.remote_addrs = remote_addresses; + handle.lengths = lengths; + /* Populate element specific parameters */ - uct_element = UCS_PTR_TYPE_OFFSET(mem->address, ucs_typeof(handle)); + handle.uct_mem_elements = uct_element = UCS_PTR_BYTE_OFFSET( + lengths, sizeof(*handle.lengths) * params->num_elements); for (i = 0; i < num_uct_eps; i++) { local_md_index = ep_config->md_index[lanes[i]]; wiface = ucp_worker_iface(ep->worker, @@ -369,14 +408,14 @@ ucp_device_mem_list_create(ucp_ep_h ep, const ucp_device_mem_list_params_t *params, ucp_device_mem_list_handle_h *handle_p) { + ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_UNKNOWN; + ucp_worker_cfg_index_t rkey_cfg_index = UCP_WORKER_CFG_INDEX_NULL; ucp_lane_index_t lanes[UCP_DEVICE_MEM_LIST_MAX_EPS]; - ucp_worker_cfg_index_t rkey_cfg_index; ucs_status_t status; ucp_rkey_config_t *rkey_config; ucs_sys_device_t local_sys_dev, remote_sys_dev; ucp_md_map_t local_md_map, remote_md_map; ucp_ep_config_t *ep_config; - ucs_memory_type_t mem_type; uct_allocated_memory_t mem; if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { @@ -392,9 +431,8 @@ ucp_device_mem_list_create(ucp_ep_h ep, } /* Perform pseudo lane selection without size */ - rkey_config = &ep->worker->rkey_config[rkey_cfg_index]; - ep_config = ucp_worker_ep_config(ep->worker, - rkey_config->key.ep_cfg_index); + rkey_config = &ep->worker->rkey_config[rkey_cfg_index]; + ep_config = ucp_worker_ep_config(ep->worker, rkey_config->key.ep_cfg_index); remote_sys_dev = rkey_config->key.sys_dev; remote_md_map = rkey_config->key.md_map; diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index 4f014a03045..72aa0fa6f70 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -96,9 +96,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add( inc_value, remote_address, flags, comp); } else if (device_ep->uct_tl_id == UCT_DEVICE_TL_CUDA_IPC) { - return uct_cuda_ipc_ep_atomic_add(device_ep, mem_elem, - inc_value, remote_address, - flags, comp); + return uct_cuda_ipc_ep_atomic_add(device_ep, mem_elem, inc_value, + remote_address, flags, comp); } return UCS_ERR_UNSUPPORTED; @@ -162,8 +161,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi( mem_list_count, addresses, remote_addresses, lengths, counter_inc_value, - counter_remote_address, - flags, comp); + counter_remote_address, flags, + comp); } return UCS_ERR_UNSUPPORTED; @@ -207,6 +206,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi( * mem_list_indices. * @param [in] addresses Array of local addresses to send from. * @param [in] remote_addresses Array of remote addresses to send to. + * @param [in] local_offsets Array of local offsets to send from. + * @param [in] remote_offsets Array of remote offsets to send to. * @param [in] lengths Array of lengths in bytes for each send. * @param [in] counter_index Index of remote increment descriptor. * @param [in] counter_inc_value Value of the remote increment. @@ -221,6 +222,7 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial( uct_device_ep_h device_ep, const uct_device_mem_element_t *mem_list, const unsigned *mem_list_indices, unsigned mem_list_count, void *const *addresses, const uint64_t *remote_addresses, + const size_t *local_offsets, const size_t *remote_offsets, const size_t *lengths, unsigned counter_index, uint64_t counter_inc_value, uint64_t counter_remote_address, uint64_t flags, uct_device_completion_t *comp) @@ -228,15 +230,15 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial( if (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA) { return uct_rc_mlx5_gda_ep_put_multi_partial( device_ep, mem_list, mem_list_indices, mem_list_count, - addresses, remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, flags, comp); + addresses, remote_addresses, local_offsets, remote_offsets, + lengths, counter_index, counter_inc_value, + counter_remote_address, flags, comp); } else if (device_ep->uct_tl_id == UCT_DEVICE_TL_CUDA_IPC) { - return uct_cuda_ipc_ep_put_multi_partial(device_ep, mem_list, - mem_list_indices, mem_list_count, - addresses, remote_addresses, - lengths, counter_index, - counter_inc_value, counter_remote_address, - flags, comp); + return uct_cuda_ipc_ep_put_multi_partial( + device_ep, mem_list, mem_list_indices, mem_list_count, + addresses, remote_addresses, local_offsets, remote_offsets, + lengths, counter_index, counter_inc_value, + counter_remote_address, flags, comp); } return UCS_ERR_UNSUPPORTED; } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh index f3a9499e9cf..d9d390d5f9a 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc.cuh +++ b/src/uct/cuda/cuda_ipc/cuda_ipc.cuh @@ -308,13 +308,12 @@ uct_cuda_ipc_ep_put_single(uct_device_ep_h device_ep, } template -UCS_F_DEVICE ucs_status_t -uct_cuda_ipc_ep_put_multi(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - unsigned mem_list_count, void *const *addresses, - const uint64_t *remote_addresses, const size_t *lengths, - uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, uct_device_completion_t *comp) +UCS_F_DEVICE ucs_status_t uct_cuda_ipc_ep_put_multi( + uct_device_ep_h device_ep, const uct_device_mem_element_t *mem_list, + unsigned mem_list_count, void *const *addresses, + const uint64_t *remote_addresses, const size_t *lengths, + uint64_t counter_inc_value, uint64_t counter_remote_address, + uint64_t flags, uct_device_completion_t *comp) { unsigned int num_put_ops = (counter_remote_address != 0) ? mem_list_count - 1 : mem_list_count; unsigned int lane_id, num_lanes; @@ -323,8 +322,8 @@ uct_cuda_ipc_ep_put_multi(uct_device_ep_h device_ep, for (int i = 0; i < num_put_ops; i++) { auto cuda_ipc_mem_element = reinterpret_cast( UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * i)); - auto mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, - remote_addresses[i]); + auto mapped_rem_addr = uct_cuda_ipc_map_remote( + cuda_ipc_mem_element, remote_addresses[i]); uct_cuda_ipc_copy_level(mapped_rem_addr, addresses[i], lengths[i]); } @@ -341,31 +340,44 @@ uct_cuda_ipc_ep_put_multi(uct_device_ep_h device_ep, } template -UCS_F_DEVICE ucs_status_t -uct_cuda_ipc_ep_put_multi_partial(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - const unsigned *mem_list_indices, unsigned mem_list_count, - void *const *addresses, const uint64_t *remote_addresses, - const size_t *lengths, unsigned counter_index, - uint64_t counter_inc_value, uint64_t counter_remote_address, - uint64_t flags, uct_device_completion_t *comp) +UCS_F_DEVICE ucs_status_t uct_cuda_ipc_ep_put_multi_partial( + uct_device_ep_h device_ep, const uct_device_mem_element_t *mem_list, + const unsigned *mem_list_indices, unsigned mem_list_count, + void *const *addresses, const uint64_t *remote_addresses, + const size_t *local_offsets, const size_t *remote_offsets, + const size_t *lengths, unsigned counter_index, + uint64_t counter_inc_value, uint64_t counter_remote_address, + uint64_t flags, uct_device_completion_t *comp) { unsigned int lane_id, num_lanes; uct_cuda_ipc_get_lane(lane_id, num_lanes); for (int i = 0; i < mem_list_count; i++) { - auto cuda_ipc_mem_element = reinterpret_cast( - UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * mem_list_indices[i])); - auto mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, remote_addresses[i]); - uct_cuda_ipc_copy_level(mapped_rem_addr, addresses[i], lengths[i]); + unsigned idx = mem_list_indices[i]; + auto cuda_ipc_mem_element = + reinterpret_cast( + UCS_PTR_BYTE_OFFSET( + mem_list, + sizeof(uct_cuda_ipc_device_mem_element_t) * + idx)); + auto src_addr = UCS_PTR_BYTE_OFFSET(addresses[idx], local_offsets[i]); + auto mapped_rem_addr = uct_cuda_ipc_map_remote( + cuda_ipc_mem_element, + remote_addresses[idx] + remote_offsets[i]); + uct_cuda_ipc_copy_level(mapped_rem_addr, src_addr, lengths[i]); } - if ((counter_remote_address != 0) && (lane_id == 0)) { - auto cuda_ipc_mem_element = reinterpret_cast( - UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * counter_index)); - auto mapped_counter_rem_addr = reinterpret_cast(uct_cuda_ipc_map_remote(cuda_ipc_mem_element, - counter_remote_address)); + if ((counter_inc_value != 0) && (lane_id == 0)) { + auto cuda_ipc_mem_element = + reinterpret_cast( + UCS_PTR_BYTE_OFFSET( + mem_list, + sizeof(uct_cuda_ipc_device_mem_element_t) * + counter_index)); + auto mapped_counter_rem_addr = reinterpret_cast( + uct_cuda_ipc_map_remote(cuda_ipc_mem_element, + counter_remote_address)); uct_cuda_ipc_atomic_inc(mapped_counter_rem_addr, counter_inc_value); } diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh index b439d31f0f1..b057bc3cf62 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.cuh +++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh @@ -347,7 +347,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( return UCS_ERR_UNSUPPORTED; } - if (counter_remote_address == 0) { + if (counter_inc_value == 0) { count--; } @@ -411,6 +411,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( uct_device_ep_h tl_ep, const uct_device_mem_element_t *tl_mem_list, const unsigned *mem_list_indices, unsigned mem_list_count, void *const *addresses, const uint64_t *remote_addresses, + const size_t *local_offsets, const size_t *remote_offsets, const size_t *lengths, unsigned counter_index, uint64_t counter_inc_value, uint64_t counter_remote_address, uint64_t flags, uct_device_completion_t *tl_comp) @@ -440,7 +441,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( return UCS_ERR_UNSUPPORTED; } - if (counter_remote_address != 0) { + if (counter_inc_value != 0) { count++; } @@ -462,10 +463,10 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( length = 8; opcode = MLX5_OPCODE_ATOMIC_FA; } else if (i < mem_list_count) { - idx = mem_list_indices[i]; - address = addresses[i]; - lkey = mem_list[idx].lkey; - remote_address = remote_addresses[i]; + idx = mem_list_indices[i]; + address = UCS_PTR_BYTE_OFFSET(addresses[idx], local_offsets[i]); + lkey = mem_list[idx].lkey; + remote_address = remote_addresses[idx] + remote_offsets[i]; length = lengths[i]; opcode = MLX5_OPCODE_RDMA_WRITE; } else { diff --git a/test/gtest/ucp/cuda/test_kernels.cu b/test/gtest/ucp/cuda/test_kernels.cu index 0f066031d77..11419a97e9b 100644 --- a/test/gtest/ucp/cuda/test_kernels.cu +++ b/test/gtest/ucp/cuda/test_kernels.cu @@ -21,32 +21,28 @@ ucp_test_kernel_do_operation(const test_ucp_device_kernel_params_t ¶ms, switch (params.operation) { case TEST_UCP_DEVICE_KERNEL_PUT_SINGLE: status = ucp_device_put_single(params.mem_list, - params.single.mem_list_index, - params.single.address, - params.single.remote_address, - params.single.length, flags, + params.single.mem_list_index, 0, + 0, params.single.length, 0, flags, req_ptr); break; case TEST_UCP_DEVICE_KERNEL_PUT_MULTI: - status = ucp_device_put_multi( - params.mem_list, params.multi.addresses, - params.multi.remote_addresses, params.multi.lengths, - params.multi.counter_inc_value, - params.multi.counter_remote_address, flags, req_ptr); + status = ucp_device_put_multi(params.mem_list, + params.multi.counter_inc_value, 0, + flags, req_ptr); break; case TEST_UCP_DEVICE_KERNEL_PUT_MULTI_PARTIAL: status = ucp_device_put_multi_partial( params.mem_list, params.partial.mem_list_indices, - params.partial.mem_list_count, params.partial.addresses, - params.partial.remote_addresses, params.partial.lengths, + params.partial.mem_list_count, + (size_t*)params.partial.local_offsets, + (size_t*)params.partial.remote_offsets, params.partial.lengths, params.partial.counter_index, params.partial.counter_inc_value, - params.partial.counter_remote_address, flags, req_ptr); + params.partial.counter_remote_offset, 0, flags, req_ptr); break; case TEST_UCP_DEVICE_KERNEL_COUNTER_INC: status = ucp_device_counter_inc( params.mem_list, params.counter_inc.mem_list_index, - params.counter_inc.inc_value, params.counter_inc.remote_address, - flags, req_ptr); + params.counter_inc.inc_value, 0, 0, flags, req_ptr); break; case TEST_UCP_DEVICE_KERNEL_COUNTER_WRITE: ucp_device_counter_write(params.local_counter.address, diff --git a/test/gtest/ucp/cuda/test_kernels.h b/test/gtest/ucp/cuda/test_kernels.h index 0f929f043c2..b15b3e80448 100644 --- a/test/gtest/ucp/cuda/test_kernels.h +++ b/test/gtest/ucp/cuda/test_kernels.h @@ -50,12 +50,12 @@ typedef struct { struct { const unsigned *mem_list_indices; unsigned mem_list_count; - void *const *addresses; - const uint64_t *remote_addresses; + size_t *local_offsets; + size_t *remote_offsets; const size_t *lengths; unsigned counter_index; uint64_t counter_inc_value; - uint64_t counter_remote_address; + uint64_t counter_remote_offset; } partial; struct { void *address; diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 06d2fb8a420..15906e020b8 100644 --- a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -101,11 +101,17 @@ test_ucp_device::mem_list::mem_list(entity &sender, entity &receiver, // Initialize elements std::vector elems(count); for (auto i = 0; i < count; ++i) { - auto &elem = elems[i]; - elem.field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | - UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY; - elem.memh = m_src[i]->memh(); - elem.rkey = m_rkeys[i]; + auto &elem = elems[i]; + elem.field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR | + UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH; + elem.memh = m_src[i]->memh(); + elem.rkey = m_rkeys[i]; + elem.local_addr = m_src[i]->ptr(); + elem.remote_addr = reinterpret_cast(m_dst[i]->ptr()); + elem.length = m_src[i]->size(); } // Initialize parameters @@ -465,18 +471,11 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi) const unsigned counter_index = count; list.dst_counter_init(counter_index); - auto addresses = ucx_cuda::make_device_vector(list.src_ptrs()); - auto remote_addresses = ucx_cuda::make_device_vector(list.dst_ptrs()); - auto lengths = ucx_cuda::make_device_vector(std::vector(count, size)); - auto params = init_params(); - params.operation = TEST_UCP_DEVICE_KERNEL_PUT_MULTI; - - params.mem_list = list.handle(); - params.multi.addresses = addresses.ptr(); - params.multi.remote_addresses = remote_addresses.ptr(); - params.multi.lengths = lengths.ptr(); - params.multi.counter_remote_address = list.dst_ptr(counter_index); - params.multi.counter_inc_value = 1; + auto params = init_params(); + params.operation = TEST_UCP_DEVICE_KERNEL_PUT_MULTI; + + params.mem_list = list.handle(); + params.multi.counter_inc_value = 1; launch_kernel(params); // Check received data @@ -504,30 +503,26 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi_partial) } } - std::vector addresses_vec; - std::vector remote_addresses_vec; - for (auto index : indexes_vec) { - addresses_vec.push_back(list.src_ptr(index)); - remote_addresses_vec.push_back(list.dst_ptr(index)); - } + std::vector local_offsets(indexes_vec.size(), 0); + std::vector remote_offsets(indexes_vec.size(), 0); - auto indexes = ucx_cuda::make_device_vector(indexes_vec); - auto addresses = ucx_cuda::make_device_vector(addresses_vec); - auto remote_addresses = ucx_cuda::make_device_vector(remote_addresses_vec); - auto lengths = ucx_cuda::make_device_vector( + auto indexes = ucx_cuda::make_device_vector(indexes_vec); + auto device_local_offsets = ucx_cuda::make_device_vector(local_offsets); + auto device_remote_offsets = ucx_cuda::make_device_vector(remote_offsets); + auto lengths = ucx_cuda::make_device_vector( std::vector(indexes_vec.size(), size)); - auto params = init_params(); - params.operation = TEST_UCP_DEVICE_KERNEL_PUT_MULTI_PARTIAL; - - params.mem_list = list.handle(); - params.partial.addresses = addresses.ptr(); - params.partial.remote_addresses = remote_addresses.ptr(); - params.partial.lengths = lengths.ptr(); - params.partial.mem_list_indices = indexes.ptr(); - params.partial.mem_list_count = indexes_vec.size(); - params.partial.counter_index = counter_index; - params.partial.counter_remote_address = list.dst_ptr(counter_index); - params.partial.counter_inc_value = 1; + auto params = init_params(); + params.operation = TEST_UCP_DEVICE_KERNEL_PUT_MULTI_PARTIAL; + + params.mem_list = list.handle(); + params.partial.local_offsets = device_local_offsets.ptr(); + params.partial.remote_offsets = device_remote_offsets.ptr(); + params.partial.lengths = lengths.ptr(); + params.partial.mem_list_indices = indexes.ptr(); + params.partial.mem_list_count = indexes_vec.size(); + params.partial.counter_index = counter_index; + params.partial.counter_remote_offset = 0; + params.partial.counter_inc_value = 1; launch_kernel(params); // Check received data diff --git a/test/gtest/uct/cuda/test_cuda_ipc_device.cc b/test/gtest/uct/cuda/test_cuda_ipc_device.cc index be1cd4575eb..500bae565f6 100644 --- a/test/gtest/uct/cuda/test_cuda_ipc_device.cc +++ b/test/gtest/uct/cuda/test_cuda_ipc_device.cc @@ -9,6 +9,7 @@ #include #include "test_kernels_uct.h" #include +#include #include class test_cuda_ipc_rma : public uct_test { @@ -307,8 +308,9 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_device) mem_buffer::pattern_fill(addresses[i], base_length, SEED1, UCS_MEMORY_TYPE_CUDA); } - cuda_uct::launch_uct_put_multi(device_ep, mem_elem, iovcnt + 1, addresses_dev, - remote_addresses_dev, lengths_dev, 4, (uint64_t)signal.ptr(), + cuda_uct::launch_uct_put_multi(device_ep, mem_elem, iovcnt + 1, + addresses_dev, remote_addresses_dev, + lengths_dev, 4, (uint64_t)signal.ptr(), device_level, num_threads, num_blocks); for (int i = 0; i < iovcnt; i++) { @@ -327,21 +329,22 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_device) UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device) { - size_t mem_elem_size = get_mem_elem_size(); - ucs_device_level_t device_level = get_device_level(); - unsigned num_threads = get_num_threads(); - unsigned num_blocks = get_num_blocks(); - size_t offset = get_offset(); - const int iovcnt = 8; - size_t length = iovcnt * (base_length + offset); - uint64_t signal_val = 4; - int counter_index = 1; + size_t mem_elem_size = get_mem_elem_size(); + ucs_device_level_t device_level = get_device_level(); + unsigned num_threads = get_num_threads(); + unsigned num_blocks = get_num_blocks(); + size_t offset = get_offset(); + const int iovcnt = 8; + size_t length = iovcnt * (base_length + offset); + uint64_t signal_val = 4; + int counter_index = 1; + std::vector offsets(iovcnt, 0); uct_device_ep_h device_ep; - uct_device_mem_element_t *mem_elem; - uint64_t *remote_addresses_dev, remote_addresses[iovcnt]; + uct_device_mem_element_t *mem_elements; + uint64_t *remote_addresses_dev, remote_addresses[iovcnt + 1]; size_t *lengths_dev, lengths[iovcnt]; - void **addresses_dev, *addresses[iovcnt]; - unsigned *mem_list_indices_dev, mem_list_indices[iovcnt + 1]; + void **addresses_dev, *addresses[iovcnt + 1]; + unsigned *mem_list_indices_dev, mem_list_indices[iovcnt]; if (device_level == UCS_DEVICE_LEVEL_GRID) { GTEST_SKIP() << "Grid level is not supported"; @@ -353,71 +356,87 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device) mapped_buffer sendbuf(length, SEED1, *m_sender, 0, UCS_MEMORY_TYPE_CUDA); mapped_buffer recvbuf(length, SEED2, *m_receiver, 0, UCS_MEMORY_TYPE_CUDA); - mapped_buffer signal(sizeof(uint64_t), 0, *m_receiver, 0, UCS_MEMORY_TYPE_CUDA); + mapped_buffer signal(sizeof(uint64_t), 0, *m_receiver, 0, + UCS_MEMORY_TYPE_CUDA); ASSERT_UCS_OK(uct_ep_get_device_ep(m_sender->ep(0), &device_ep)); - ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&mem_elem, mem_elem_size * (iovcnt + 1))); - ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&remote_addresses_dev, iovcnt * sizeof(uint64_t))); - ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&lengths_dev, iovcnt * sizeof(size_t))); - ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&addresses_dev, iovcnt * sizeof(void *))); - ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&mem_list_indices_dev, iovcnt * sizeof(unsigned))); + ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr*)&mem_elements, + mem_elem_size * (iovcnt + 1))); + ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr*)&remote_addresses_dev, + (iovcnt + 1) * sizeof(uint64_t))); + ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr*)&addresses_dev, + (iovcnt + 1) * sizeof(void*))); + ASSERT_EQ(CUDA_SUCCESS, + cuMemAlloc((CUdeviceptr*)&lengths_dev, iovcnt * sizeof(size_t))); + ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr*)&mem_list_indices_dev, + iovcnt * sizeof(unsigned))); /* Fill indices and pack PUT entries */ - for (int i = 0; i < iovcnt; i++) { - unsigned idx = (i < counter_index) ? i : (i + 1); - mem_list_indices[i] = idx; - uct_device_mem_element_t *mem_elem_iov = - (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_elem, - mem_elem_size * idx); - ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), sendbuf.memh(), - recvbuf.rkey(), mem_elem_iov)); + int idx = 0; + for (int i = 0; i < iovcnt + 1; i++) { + uct_device_mem_element_t *mem_elem = (uct_device_mem_element_t*) + UCS_PTR_BYTE_OFFSET(mem_elements, mem_elem_size * i); + if (i == counter_index) { + ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), nullptr, + signal.rkey(), mem_elem)); + addresses[i] = nullptr; + remote_addresses[i] = (uint64_t)signal.ptr(); + continue; + } + + ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), + sendbuf.memh(), recvbuf.rkey(), + mem_elem)); + size_t iov_offset = (base_length + offset) * idx; + addresses[i] = UCS_PTR_BYTE_OFFSET(sendbuf.ptr(), iov_offset); + remote_addresses[i] = (uint64_t)UCS_PTR_BYTE_OFFSET(recvbuf.ptr(), + iov_offset); + lengths[idx] = base_length; + mem_list_indices[idx] = i; + mem_buffer::pattern_fill(addresses[i], base_length, SEED1, + UCS_MEMORY_TYPE_CUDA); + idx++; } /* Pack counter entry directly at mem_list[counter_index] */ - uct_device_mem_element_t *mem_elem_counter = - (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_elem, - mem_elem_size * counter_index); - ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), nullptr, - signal.rkey(), mem_elem_counter)); - for (int i = 0; i < iovcnt; i++) { - size_t iov_offset = (base_length + offset) * i; - addresses[i] = UCS_PTR_BYTE_OFFSET(sendbuf.ptr(), iov_offset); - remote_addresses[i] = (uint64_t)UCS_PTR_BYTE_OFFSET(recvbuf.ptr(), iov_offset); - lengths[i] = base_length; - } - - ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)remote_addresses_dev, remote_addresses, - iovcnt * sizeof(uint64_t))); + ASSERT_EQ(CUDA_SUCCESS, + cuMemcpyHtoD((CUdeviceptr)remote_addresses_dev, remote_addresses, + (iovcnt + 1) * sizeof(uint64_t))); + ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)addresses_dev, addresses, + (iovcnt + 1) * sizeof(void*))); ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)lengths_dev, lengths, iovcnt * sizeof(size_t))); - ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)addresses_dev, addresses, - iovcnt * sizeof(void*))); - ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)mem_list_indices_dev, mem_list_indices, - iovcnt * sizeof(unsigned))); - for (int i = 0; i < iovcnt; i++) { - mem_buffer::pattern_fill(addresses[i], base_length, SEED1, UCS_MEMORY_TYPE_CUDA); - } - - cuda_uct::launch_uct_put_multi_partial(device_ep, mem_elem, mem_list_indices_dev, - iovcnt, addresses_dev, - remote_addresses_dev, lengths_dev, - counter_index, signal_val, (uint64_t)signal.ptr(), - device_level, num_threads, num_blocks); - - for (int i = 0; i < iovcnt; i++) { - mem_buffer::pattern_check(UCS_PTR_BYTE_OFFSET(recvbuf.ptr(), (base_length + offset) * i), - base_length, SEED1, UCS_MEMORY_TYPE_CUDA); + ASSERT_EQ(CUDA_SUCCESS, + cuMemcpyHtoD((CUdeviceptr)mem_list_indices_dev, mem_list_indices, + iovcnt * sizeof(unsigned))); + auto offsets_dev = ucx_cuda::make_device_vector(offsets).ptr(); + + cuda_uct::launch_uct_put_multi_partial(device_ep, mem_elements, + mem_list_indices_dev, iovcnt, + addresses_dev, remote_addresses_dev, + offsets_dev, lengths_dev, + counter_index, signal_val, + (uint64_t)signal.ptr(), device_level, + num_threads, num_blocks); + for (int i = 0; i < iovcnt + 1; i++) { + if (i == counter_index) { + continue; + } + mem_buffer::pattern_check((void*)remote_addresses[i], base_length, + SEED1, UCS_MEMORY_TYPE_CUDA); } - ASSERT_EQ(mem_buffer::compare(&signal_val, signal.ptr(), - sizeof(signal_val), UCS_MEMORY_TYPE_CUDA), 1); + ASSERT_EQ(mem_buffer::compare(&signal_val, signal.ptr(), sizeof(signal_val), + UCS_MEMORY_TYPE_CUDA), + 1); - cuMemFree((CUdeviceptr)mem_elem); + cuMemFree((CUdeviceptr)mem_elements); cuMemFree((CUdeviceptr)remote_addresses_dev); cuMemFree((CUdeviceptr)lengths_dev); cuMemFree((CUdeviceptr)addresses_dev); + cuMemFree((CUdeviceptr)mem_list_indices_dev); } UCS_TEST_P(test_cuda_ipc_rma_device, atomic_add_device) diff --git a/test/gtest/uct/cuda/test_kernels.cu b/test/gtest/uct/cuda/test_kernels.cu index b44ee8efa52..06f584a1bfb 100644 --- a/test/gtest/uct/cuda/test_kernels.cu +++ b/test/gtest/uct/cuda/test_kernels.cu @@ -157,6 +157,7 @@ uct_put_partial_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_list, __shared__ size_t sizes[iovcnt]; __shared__ void *src[iovcnt]; __shared__ uint64_t dst[iovcnt]; + __shared__ size_t offsets[iovcnt]; int lane_id = threadIdx.x; ucs_status_t status; @@ -165,12 +166,14 @@ uct_put_partial_kernel(uct_device_ep_h ep, uct_device_mem_element_t *mem_list, sizes[lane_id] = length / iovcnt; src[lane_id] = (void*)((uintptr_t)va + length / iovcnt * lane_id); dst[lane_id] = rva + length / iovcnt * lane_id; + offsets[lane_id] = 0; } __syncwarp(); status = uct_device_ep_put_multi_partial( - ep, mem_list, indices, iovcnt, src, dst, sizes, iovcnt, 4, - atomic_rva, UCT_DEVICE_FLAG_NODELAY, &comp); + ep, mem_list, indices, iovcnt, src, dst, (const size_t*)offsets, + (const size_t*)offsets, sizes, iovcnt, 4, atomic_rva, + UCT_DEVICE_FLAG_NODELAY, &comp); if (status != UCS_INPROGRESS) { *status_p = status; return; diff --git a/test/gtest/uct/cuda/test_kernels_uct.cu b/test/gtest/uct/cuda/test_kernels_uct.cu index 0e009f29489..7593a5d2343 100644 --- a/test/gtest/uct/cuda/test_kernels_uct.cu +++ b/test/gtest/uct/cuda/test_kernels_uct.cu @@ -95,7 +95,7 @@ template class device_result_ptr { return true; } return false; - } +} template static __global__ void @@ -224,56 +224,75 @@ ucs_status_t launch_uct_atomic(uct_device_ep_h device_ep, template static __global__ void uct_put_multi_kernel(uct_device_ep_h ep, - const uct_device_mem_element_t *mem_list, - size_t mem_list_count, void *const *addresses, - const uint64_t *remote_addresses, const size_t *lengths, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_status_t *status_p) + const uct_device_mem_element_t *mem_list, + size_t mem_list_count, void *const *addresses, + const uint64_t *remote_addresses, const size_t *lengths, + uint64_t counter_inc_value, + uint64_t counter_remote_address, ucs_status_t *status_p) { uct_device_completion_t comp; if (is_op_enabled(level)) { - *status_p = uct_device_ep_put_multi(ep, mem_list, mem_list_count, addresses, - remote_addresses, lengths, - counter_inc_value, counter_remote_address, - UCT_DEVICE_FLAG_NODELAY, &comp); + *status_p = uct_device_ep_put_multi(ep, mem_list, mem_list_count, + addresses, remote_addresses, + lengths, counter_inc_value, + counter_remote_address, + UCT_DEVICE_FLAG_NODELAY, + &comp); } } -ucs_status_t launch_uct_put_multi(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - size_t mem_list_count, void *const *addresses, - const uint64_t *remote_addresses, const size_t *lengths, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_device_level_t level, - unsigned num_threads, unsigned num_blocks) +ucs_status_t +launch_uct_put_multi(uct_device_ep_h device_ep, + const uct_device_mem_element_t *mem_list, + size_t mem_list_count, void *const *addresses, + const uint64_t *remote_addresses, const size_t *lengths, + uint64_t counter_inc_value, + uint64_t counter_remote_address, ucs_device_level_t level, + unsigned num_threads, unsigned num_blocks) { device_result_ptr status = UCS_ERR_NOT_IMPLEMENTED; cudaError_t st; switch (level) { - case UCS_DEVICE_LEVEL_THREAD: - uct_put_multi_kernel<<>>( - device_ep, mem_list, mem_list_count, addresses, remote_addresses, lengths, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_WARP: - uct_put_multi_kernel<<>>( - device_ep, mem_list, mem_list_count, addresses, remote_addresses, lengths, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_BLOCK: - uct_put_multi_kernel<<>>( - device_ep, mem_list, mem_list_count, addresses, remote_addresses, lengths, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_GRID: - uct_put_multi_kernel<<>>( - device_ep, mem_list, mem_list_count, addresses, remote_addresses, lengths, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - default: - throw std::runtime_error("Unsupported level"); + case UCS_DEVICE_LEVEL_THREAD: + uct_put_multi_kernel + <<>>(device_ep, mem_list, + mem_list_count, addresses, + remote_addresses, lengths, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_WARP: + uct_put_multi_kernel + <<>>(device_ep, mem_list, + mem_list_count, addresses, + remote_addresses, lengths, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_BLOCK: + uct_put_multi_kernel + <<>>(device_ep, mem_list, + mem_list_count, addresses, + remote_addresses, lengths, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_GRID: + uct_put_multi_kernel + <<>>(device_ep, mem_list, + mem_list_count, addresses, + remote_addresses, lengths, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + default: + throw std::runtime_error("Unsupported level"); } st = cudaGetLastError(); @@ -286,64 +305,79 @@ ucs_status_t launch_uct_put_multi(uct_device_ep_h device_ep, } template -static __global__ void -uct_put_multi_partial_kernel(uct_device_ep_h ep, - const uct_device_mem_element_t *mem_list, - const unsigned *mem_list_indices, unsigned mem_list_count, - void *const *addresses, const uint64_t *remote_addresses, - const size_t *lengths, unsigned counter_index, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_status_t *status_p) +static __global__ void uct_put_multi_partial_kernel( + uct_device_ep_h ep, const uct_device_mem_element_t *mem_list, + const unsigned *mem_list_indices, unsigned mem_list_count, + void *const *addresses, const uint64_t *remote_addresses, + const size_t *offsets, const size_t *lengths, unsigned counter_index, + uint64_t counter_inc_value, uint64_t counter_remote_address, + ucs_status_t *status_p) { uct_device_completion_t comp; if (is_op_enabled(level)) { - *status_p = uct_device_ep_put_multi_partial(ep, mem_list, mem_list_indices, mem_list_count, - addresses, remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, - UCT_DEVICE_FLAG_NODELAY, &comp); + *status_p = uct_device_ep_put_multi_partial( + ep, mem_list, mem_list_indices, mem_list_count, addresses, + remote_addresses, offsets, offsets, lengths, counter_index, + counter_inc_value, counter_remote_address, + UCT_DEVICE_FLAG_NODELAY, &comp); } } -ucs_status_t launch_uct_put_multi_partial(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - const unsigned *mem_list_indices, unsigned mem_list_count, - void *const *addresses, const uint64_t *remote_addresses, - const size_t *lengths, unsigned counter_index, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_device_level_t level, - unsigned num_threads, unsigned num_blocks) +ucs_status_t launch_uct_put_multi_partial( + uct_device_ep_h device_ep, const uct_device_mem_element_t *mem_list, + const unsigned *mem_list_indices, unsigned mem_list_count, + void *const *addresses, const uint64_t *remote_addresses, + const size_t *offsets, const size_t *lengths, unsigned counter_index, + uint64_t counter_inc_value, uint64_t counter_remote_address, + ucs_device_level_t level, unsigned num_threads, unsigned num_blocks) { device_result_ptr status = UCS_ERR_NOT_IMPLEMENTED; cudaError_t st; switch (level) { - case UCS_DEVICE_LEVEL_THREAD: - uct_put_multi_partial_kernel<<>>( - device_ep, mem_list, mem_list_indices, mem_list_count, addresses, - remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_WARP: - uct_put_multi_partial_kernel<<>>( - device_ep, mem_list, mem_list_indices, mem_list_count, addresses, - remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_BLOCK: - uct_put_multi_partial_kernel<<>>( - device_ep, mem_list, mem_list_indices, mem_list_count, addresses, - remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - case UCS_DEVICE_LEVEL_GRID: - uct_put_multi_partial_kernel<<>>( - device_ep, mem_list, mem_list_indices, mem_list_count, addresses, - remote_addresses, lengths, counter_index, - counter_inc_value, counter_remote_address, status.device_ptr()); - break; - default: - throw std::runtime_error("Unsupported level"); + case UCS_DEVICE_LEVEL_THREAD: + uct_put_multi_partial_kernel + <<>>(device_ep, mem_list, + mem_list_indices, mem_list_count, + addresses, remote_addresses, + offsets, lengths, counter_index, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_WARP: + uct_put_multi_partial_kernel + <<>>(device_ep, mem_list, + mem_list_indices, mem_list_count, + addresses, remote_addresses, + offsets, lengths, counter_index, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_BLOCK: + uct_put_multi_partial_kernel + <<>>(device_ep, mem_list, + mem_list_indices, mem_list_count, + addresses, remote_addresses, + offsets, lengths, counter_index, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + case UCS_DEVICE_LEVEL_GRID: + uct_put_multi_partial_kernel + <<>>(device_ep, mem_list, + mem_list_indices, mem_list_count, + addresses, remote_addresses, + offsets, lengths, counter_index, + counter_inc_value, + counter_remote_address, + status.device_ptr()); + break; + default: + throw std::runtime_error("Unsupported level"); } st = cudaGetLastError(); diff --git a/test/gtest/uct/cuda/test_kernels_uct.h b/test/gtest/uct/cuda/test_kernels_uct.h index b902aa2d0fe..beaba9b03e7 100644 --- a/test/gtest/uct/cuda/test_kernels_uct.h +++ b/test/gtest/uct/cuda/test_kernels_uct.h @@ -29,22 +29,22 @@ ucs_status_t launch_uct_atomic(uct_device_ep_h device_ep, unsigned num_threads, unsigned num_blocks); -ucs_status_t launch_uct_put_multi(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - size_t mem_list_count, void *const *addresses, - const uint64_t *remote_addresses, const size_t *lengths, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_device_level_t level, - unsigned num_threads, unsigned num_blocks); - -ucs_status_t launch_uct_put_multi_partial(uct_device_ep_h device_ep, - const uct_device_mem_element_t *mem_list, - const unsigned *mem_list_indices, unsigned mem_list_count, - void *const *addresses, const uint64_t *remote_addresses, - const size_t *lengths, unsigned counter_index, - uint64_t counter_inc_value, uint64_t counter_remote_address, - ucs_device_level_t level, - unsigned num_threads, unsigned num_blocks); +ucs_status_t +launch_uct_put_multi(uct_device_ep_h device_ep, + const uct_device_mem_element_t *mem_list, + size_t mem_list_count, void *const *addresses, + const uint64_t *remote_addresses, const size_t *lengths, + uint64_t counter_inc_value, + uint64_t counter_remote_address, ucs_device_level_t level, + unsigned num_threads, unsigned num_blocks); + +ucs_status_t launch_uct_put_multi_partial( + uct_device_ep_h device_ep, const uct_device_mem_element_t *mem_list, + const unsigned *mem_list_indices, unsigned mem_list_count, + void *const *addresses, const uint64_t *remote_addresses, + const size_t *offsets, const size_t *lengths, unsigned counter_index, + uint64_t counter_inc_value, uint64_t counter_remote_address, + ucs_device_level_t level, unsigned num_threads, unsigned num_blocks); }; // namespace cuda_uct #endif