From 1f5ec3ee94da4301fd204ab92bc97ac0887ee446 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 19 Sep 2024 14:08:58 -0600 Subject: [PATCH 1/7] pml/ob1: fix potential double return of RDMA fragment on get operation failure The mca_pml_ob1_recv_request_get_frag_failed method is responsible for returning or queueing the fragment but mca_pml_ob1_rget_completion was freeing it unconditionally. This will lead to a double return of the fragment to the free list and may lead to other errors if the fragment was queued for retry. This commit fixes the issue by only returning the fragment if it did not fail. Signed-off-by: Nathan Hjelm (cherry picked from commit b7f8caea5477d60746ca116ee64bf9c6541314da) --- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index afbb8c7aad6..dc9d97fcf4a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -413,6 +413,7 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_ /* check completion status */ if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { status = mca_pml_ob1_recv_request_get_frag_failed (frag, status); + /* fragment was returned or queue by the above call */ if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { size_t skipped_bytes = recvreq->req_send_offset - recvreq->req_rdma_offset; opal_output_verbose(mca_pml_ob1_output, 1, "pml:ob1: %s: operation failed with code %d", __func__, status); @@ -435,12 +436,12 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_ mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag, frag->rdma_length, 0, 0); + + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } recv_request_pml_complete_check(recvreq); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } From 40c290e0680448b072f1f4cd99a9a3f9c3626476 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 19 Sep 2024 14:03:56 -0600 Subject: [PATCH 2/7] pml/ob1: fix double increment of the RDMA frag retry counter If a put or get operation fails it may later be retried by mca_pml_ob1_process_pending_rdma which increments retries on each new attempt. There is a flaw in the code where both the put and get failures also increment this counter leading to it giving up twice as fast. This commit removes the increments on the put and get failures. Signed-off-by: Nathan Hjelm (cherry picked from commit 27efeb9ebdcf9e8e4f36df7d52d1834e4346f654) --- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 2 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 37 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index dc9d97fcf4a..3ef618a0c79 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -382,7 +382,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr } } - if (++frag->retries < mca_pml_ob1.rdma_retries_limit && + if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 24336d2590d..f36cd477d8a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -1268,30 +1268,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { + if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { /* queue the frag for later if there was a resource error */ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - } else { + return; + } + #if OPAL_ENABLE_FT - if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) { - return; - } -#endif /* OPAL_ENABLE_FT */ - /* tell receiver to deregister memory */ - mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, - frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, - OPAL_ERR_TEMP_OUT_OF_RESOURCE); - - /* send fragment by copy in/out */ - mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, - frag->rdma_length); - /* if a pointer to a receive request is not set it means that - * ACK was not yet received. Don't schedule sends before ACK */ - if (NULL != sendreq->req_recv.pval) - mca_pml_ob1_send_request_schedule (sendreq); + if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) { + return; } +#endif /* OPAL_ENABLE_FT */ + /* tell receiver to deregister memory */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, + OPAL_ERR_TEMP_OUT_OF_RESOURCE); + + /* send fragment by copy in/out */ + mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, + frag->rdma_length); + /* if a pointer to a receive request is not set it means that + * ACK was not yet received. Don't schedule sends before ACK */ + if (NULL != sendreq->req_recv.pval) + mca_pml_ob1_send_request_schedule (sendreq); } /** From 25862a9e9886a1034a77a405b507b4d4368c1702 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 19 Sep 2024 13:58:53 -0600 Subject: [PATCH 3/7] pml/ob1: ensure RDMA fragments are released in the get -> send/recv fallback Under a number of circumstances it may be necessary to abandon an RDMA get in ob1. In some cases it falls back to put but it may fall back to using send/recv. If that happens then we may either crash or leak RDMA fragments because they are still attached to the send request. Debug builds will crash due to a check on rdma_frag when they are returned. This CL fixes the flaw by releasing any rdma fragment when sceduling sends. Signed-off-by: Nathan Hjelm (cherry picked from commit 020e83f3e0cc3bd8814b3693af2b1e298a634a3b) --- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f36cd477d8a..0dd246917c0 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -22,6 +22,7 @@ * Copyright (c) 2018-2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2024 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1110,6 +1111,12 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq) range = get_send_range(sendreq); + if (NULL != sendreq->rdma_frag) { + /* this request was first attempted with RDMA but is now using send/recv */ + MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag); + sendreq->rdma_frag = NULL; + } + while(range && (false == sendreq->req_throttle_sends || sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) { mca_pml_ob1_frag_hdr_t* hdr; From 18abef585791c76a68ce5e44e4eaf1b98d7d72be Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 19 Sep 2024 16:09:26 -0600 Subject: [PATCH 4/7] btl/uct: fix fetching atomics support for osc/rdma The check that requires 32 and 64 bit atomic support is way outdated at this point. This commit takes the union of the two instead of the intersection because technically it is up to the btl user to determine which of each type are supported. This commit also ensures that MCA_BTL_ATOMIC_SUPPORTS_32BIT is set when 32-bit atomics are available and ensures that the required MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION flag is set on the btl. Signed-off-by: Nathan Hjelm (cherry picked from commit 8adf9f55a23f0d70b4ff9243856a5f3283b7b3e1) --- opal/mca/btl/uct/btl_uct_module.c | 3 ++- opal/mca/btl/uct/btl_uct_tl.c | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c index 6e80bab65f3..9577d615b92 100644 --- a/opal/mca/btl/uct/btl_uct_module.c +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -337,7 +337,8 @@ mca_btl_uct_module_t mca_btl_uct_module_template = { /* set the default flags for this btl. uct provides us with rdma and both * fetching and non-fetching atomics (though limited to add and cswap) */ - .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS, + .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS + | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT, diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index 2205508389e..5669e88c061 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -78,11 +78,10 @@ static void mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t *module, mc uint64_t atomic_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags; uint64_t atomic_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags; - /* NTH: don't really have a way to separate 32-bit and 64-bit right now */ - uint64_t all_flags = atomic_flags32 & atomic_flags64; - - module->super.btl_atomic_flags = 0; + uint64_t all_flags = atomic_flags64 | atomic_flags32; + module->super.btl_atomic_flags = (0 != atomic_flags32) ? MCA_BTL_ATOMIC_SUPPORTS_32BIT : 0; + if (cap_flags & UCT_IFACE_FLAG_ATOMIC_CPU) { module->super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; } From 3d05ccca8618b2707c5059df2f1714566e7b51e0 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 23 Sep 2024 20:59:14 -0700 Subject: [PATCH 5/7] Cleanup unused code. Signed-off-by: George Bosilca (cherry picked from commit 58400ade0ba46a45db6a33e215aff2cc32583c30) --- ompi/mca/pml/ob1/pml_ob1_isend.c | 2 +- opal/mca/btl/sm/btl_sm_send.c | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 8c9334764d7..b3e1741a239 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -143,7 +143,7 @@ static inline int mca_pml_ob1_send_inline (const void *buf, size_t count, } if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; + return rc; } return (int) size; diff --git a/opal/mca/btl/sm/btl_sm_send.c b/opal/mca/btl/sm/btl_sm_send.c index ca9aca0cf81..34ac9295d7b 100644 --- a/opal/mca/btl/sm/btl_sm_send.c +++ b/opal/mca/btl/sm/btl_sm_send.c @@ -73,18 +73,4 @@ int mca_btl_sm_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpo } return OPAL_SUCCESS; - -#if 0 - if (((frag->hdr->flags & MCA_BTL_SM_FLAG_SINGLE_COPY) || - !(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) && - frag->base.des_cbfunc) { - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - return OPAL_SUCCESS; - } - - /* data is gone (from the pml's perspective). frag callback/release will - happen later */ - return 1; -#endif } From 86893708c09602a7f9fea6a04eb1212cbdf9bf90 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 23 Sep 2024 20:58:19 -0700 Subject: [PATCH 6/7] Allow for packing less data than expected. Return the updated amount to allow the upper level to gracefully handle the case. Signed-off-by: George Bosilca (cherry picked from commit 27514c26e2acabbe6f055ff6f6c750336f68166d) --- opal/mca/btl/uct/btl_uct_am.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c index 312c85e83e5..1aae456842c 100644 --- a/opal/mca/btl/uct/btl_uct_am.c +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -51,7 +51,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl, } static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t header_size, - opal_convertor_t *convertor, size_t payload_size) + opal_convertor_t *convertor, size_t* payload_size) { uint32_t iov_count = 1; struct iovec iov; @@ -64,11 +64,9 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade /* pack the data into the supplied buffer */ iov.iov_base = (IOVBASE_TYPE *) ((intptr_t) data + header_size); - iov.iov_len = length = payload_size; + iov.iov_len = *payload_size; - (void) opal_convertor_pack(convertor, &iov, &iov_count, &length); - - assert(length == payload_size); + (void) opal_convertor_pack(convertor, &iov, &iov_count, payload_size); } struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t *btl, @@ -92,7 +90,10 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t } _mca_btl_uct_send_pack((void *) ((intptr_t) frag->uct_iov.buffer + reserve), NULL, 0, - convertor, *size); + convertor, size); + /* update the length of the fragment according to the convertor packed data */ + frag->segments[0].seg_len = reserve + *size; + frag->uct_iov.length = frag->segments[0].seg_len; } else { opal_convertor_get_current_pointer(convertor, &data_ptr); assert(NULL != data_ptr); @@ -286,7 +287,7 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg) am_header->value = args->am_header; _mca_btl_uct_send_pack((void *) ((intptr_t) data + 8), args->header, args->header_size, - args->convertor, args->payload_size); + args->convertor, &args->payload_size); return args->header_size + args->payload_size + 8; } @@ -329,9 +330,18 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo } else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id) .cap.am.max_short) { int8_t *data = alloca(total_size); - _mca_btl_uct_send_pack(data, header, header_size, convertor, payload_size); - ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data, - total_size); + size_t packed_payload_size = payload_size; + _mca_btl_uct_send_pack(data, header, header_size, convertor, &packed_payload_size); + if (packed_payload_size != payload_size) { + /* This should never happen as the packed data should go in a single pack. But + in case it does, fallback onto a descriptor allocation and let the caller + send the data. + */ + ucs_status = UCS_ERR_NO_RESOURCE; + } else { + ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data, + total_size); + } } else { ssize_t size; From 15cd49763a96555537a0bcd957d97ae8d830f31c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 23 Sep 2024 20:57:19 -0700 Subject: [PATCH 7/7] Return faster if nothing to do. Signed-off-by: George Bosilca (cherry picked from commit f29109ac111ff999ed66d1d1e6814f42d0a37021) --- opal/datatype/opal_datatype_internal.h | 2 +- opal/datatype/opal_datatype_position.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index e403080a445..47f6e0107cf 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -539,7 +539,7 @@ struct opal_datatype_t; # define OPAL_DATATYPE_SAFEGUARD_POINTER(ACTPTR, LENGTH, INITPTR, PDATA, COUNT) \ { \ unsigned char *__lower_bound = (INITPTR), *__upper_bound; \ - assert(((LENGTH) != 0) && ((COUNT) != 0)); \ + assert( (COUNT) != 0 ); \ __lower_bound += (PDATA)->true_lb; \ __upper_bound = (INITPTR) + (PDATA)->true_ub + \ ((PDATA)->ub - (PDATA)->lb) * ((COUNT) -1); \ diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index b43bfd7e545..3d82d73c0c6 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -66,8 +66,8 @@ static inline void position_single_block(opal_convertor_t *CONVERTOR, unsigned c } /** - * Advance the convertors' position according. Update the pointer and the remaining space - * accordingly. + * Advance the convertors' position according to account for *COUNT elements. Update + * the pointer and the remaining space accordingly. */ static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem_desc_t *ELEM, size_t *COUNT, unsigned char **POINTER, size_t *SPACE) @@ -82,7 +82,8 @@ static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem if (cando_count > *(COUNT)) { cando_count = *(COUNT); - } + } else if( 0 == cando_count ) + return; if (1 == _elem->blocklen) { DO_DEBUG(opal_output(0,