diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index e67202868e8..57aba677a8a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -382,7 +382,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr } } - if (++frag->retries < mca_pml_ob1.rdma_retries_limit && + if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); @@ -413,6 +413,7 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_ /* check completion status */ if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { status = mca_pml_ob1_recv_request_get_frag_failed (frag, status); + /* fragment was returned or queued by the above call */ if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { size_t skipped_bytes = recvreq->req_send_offset - recvreq->req_rdma_offset; opal_output_verbose(mca_pml_ob1_output, 1, "pml:ob1: %s: operation failed with code %d", __func__, status); @@ -435,12 +436,12 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_ mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag, frag->rdma_length, 0, 0); + + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } recv_request_pml_complete_check(recvreq); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 24336d2590d..0dd246917c0 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -22,6 +22,7 @@ * Copyright (c) 2018-2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2024 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -1110,6 +1111,12 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq) range = get_send_range(sendreq); + if (NULL != sendreq->rdma_frag) { + /* this request was first attempted with RDMA but is now using send/recv */ + MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag); + sendreq->rdma_frag = NULL; + } + while(range && (false == sendreq->req_throttle_sends || sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) { mca_pml_ob1_frag_hdr_t* hdr; @@ -1268,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { + if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { /* queue the frag for later if there was a resource error */ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - } else { + return; + } + #if OPAL_ENABLE_FT - if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) { - return; - } -#endif /* OPAL_ENABLE_FT */ - /* tell receiver to deregister memory */ - mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, - frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, - OPAL_ERR_TEMP_OUT_OF_RESOURCE); - - /* send fragment by copy in/out */ - mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, - frag->rdma_length); - /* if a pointer to a receive request is not set it means that - * ACK was not yet received. 
Don't schedule sends before ACK */ - if (NULL != sendreq->req_recv.pval) - mca_pml_ob1_send_request_schedule (sendreq); + if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) { + return; } +#endif /* OPAL_ENABLE_FT */ + /* tell receiver to deregister memory */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, + OPAL_ERR_TEMP_OUT_OF_RESOURCE); + + /* send fragment by copy in/out */ + mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, + frag->rdma_length); + /* if a pointer to a receive request is not set it means that + * ACK was not yet received. Don't schedule sends before ACK */ + if (NULL != sendreq->req_recv.pval) + mca_pml_ob1_send_request_schedule (sendreq); } /**