Rework the MPI connect/accept code #12586

Open · wants to merge 2 commits into main
42 changes: 16 additions & 26 deletions ompi/dpm/dpm.c
@@ -25,6 +25,7 @@
* reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -118,7 +119,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
{
int k, size, rsize, rank, rc, rportlen=0;
char **members = NULL, *nstring, *rport=NULL, *key, *pkey;
bool dense, isnew;
bool isnew;
opal_process_name_t pname;
opal_list_t ilist, mlist, rlist;
pmix_info_t info, tinfo;
@@ -132,7 +133,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
ompi_communicator_t *newcomp=MPI_COMM_NULL;
ompi_proc_t *proc;
ompi_group_t *group=comm->c_local_group;
ompi_proc_t **proc_list=NULL, **new_proc_list = NULL;
ompi_proc_t **new_proc_list = NULL;
int32_t i;
ompi_group_t *new_group_pointer;
ompi_dpm_proct_caddy_t *cd;
@@ -180,38 +181,27 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
opal_argv_append_nosize(&members, nstring);
free(nstring);
} else {
if (OMPI_GROUP_IS_DENSE(group)) {
proc_list = group->grp_proc_pointers;
dense = true;
} else {
proc_list = (ompi_proc_t**)calloc(group->grp_proc_count,
sizeof(ompi_proc_t *));
for (i=0 ; i<group->grp_proc_count ; i++) {
if (NULL == (proc_list[i] = ompi_group_peer_lookup(group,i))) {
for (i = 0; i < size; i++) {
if (OMPI_GROUP_IS_DENSE(group)) {
proc = group->grp_proc_pointers[i];
} else {
if( NULL == (proc = ompi_group_peer_lookup(group, i)) ) {
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
rc = OMPI_ERR_NOT_FOUND;
free(proc_list);
opal_argv_free(members);
goto exit;
}
}
dense = false;
}
for (i=0; i < size; i++) {
opal_process_name_t proc_name;
if (ompi_proc_is_sentinel (proc_list[i])) {
proc_name = ompi_proc_sentinel_to_name ((uintptr_t) proc_list[i]);
if (ompi_proc_is_sentinel (proc)) {
pname = ompi_proc_sentinel_to_name ((uintptr_t)proc);
} else {
proc_name = proc_list[i]->super.proc_name;
pname = proc->super.proc_name;
}
OPAL_PMIX_CONVERT_NAME(&pxproc, &proc_name);
OPAL_PMIX_CONVERT_NAME(&pxproc, &pname);
OPAL_PMIX_CONVERT_PROCT_TO_STRING(&nstring, &pxproc);
opal_argv_append_nosize(&members, nstring);
free(nstring);
}
if (!dense) {
free(proc_list);
proc_list = NULL;
}
}

if (rank == root) {
@@ -379,7 +369,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
}
opal_argv_free(members);

/* convert the list of members to a pmix_proc_t array */
/* convert the list of all members to a pmix_proc_t array */
nprocs = opal_list_get_size(&mlist);
PMIX_PROC_CREATE(procs, nprocs);
n = 0;
@@ -449,7 +439,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
continue; /* not a proc from this jobid */

new_proc_list[i] = proc;
opal_list_remove_item(&ilist, (opal_list_item_t*)cd); // TODO: do we need to release cd ?
opal_list_remove_item(&ilist, (opal_list_item_t*)cd);
OBJ_RELEASE(cd);
/* ompi_proc_complete_init_single() initializes and optionally retrieves
* OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
@@ -489,7 +479,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
} while (!opal_list_is_empty(&ilist));

/* call add_procs on the new ones */
rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
rc = MCA_PML_CALL(add_procs(new_proc_list, i));
free(new_proc_list);
new_proc_list = NULL;
if (OMPI_SUCCESS != rc) {
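The dpm.c hunks above remove the temporary proc_list and the dense/free bookkeeping: each member is now resolved inside a single loop, so the error path no longer has an allocation to clean up. They also appear to fix the count passed to add_procs(): the preceding do/while drains ilist, so opal_list_get_size(&ilist) would be 0 by the time add_procs() runs, whereas the loop counter i holds the number of entries actually written into new_proc_list. Below is a minimal, self-contained sketch of the consolidated loop shape; all types and helpers are simplified stand-ins, not the real Open MPI definitions.

```c
/*
 * Sketch of the loop shape adopted in ompi_dpm_connect_accept(): resolve
 * each proc inside the loop instead of first materializing a temporary
 * proc_list (plus a 'dense' flag recording whether it must be freed).
 * Stand-in types only; not the real OMPI definitions.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { const char *name; } proc_t;

typedef struct {
    bool     dense;      /* dense groups expose direct proc pointers */
    int      count;
    proc_t **procs;
} group_t;

/* stand-in for ompi_group_peer_lookup(); the real sparse lookup is indirect */
static proc_t *group_peer_lookup(group_t *g, int i)
{
    return (i >= 0 && i < g->count) ? g->procs[i] : NULL;
}

static int append_members(group_t *group)
{
    for (int i = 0; i < group->count; i++) {
        proc_t *proc;
        if (group->dense) {
            proc = group->procs[i];              /* direct pointer       */
        } else {
            proc = group_peer_lookup(group, i);  /* per-entry lookup     */
            if (NULL == proc) {
                return -1;                       /* no temporary to free */
            }
        }
        printf("member %d -> %s\n", i, proc->name);
    }
    return 0;
}

int main(void)
{
    proc_t a = { "proc-0" }, b = { "proc-1" };
    proc_t *ptrs[] = { &a, &b };
    group_t g = { .dense = false, .count = 2, .procs = ptrs };
    return (0 == append_members(&g)) ? EXIT_SUCCESS : EXIT_FAILURE;
}
```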
8 changes: 6 additions & 2 deletions opal/mca/btl/tcp/btl_tcp_component.c
@@ -15,7 +15,7 @@
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2024 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@@ -1499,12 +1499,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void *user)
}
}


/* lookup the corresponding process */
btl_proc = mca_btl_tcp_proc_lookup(&guid);
if (NULL == btl_proc) {
const char *peer = opal_fd_get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt", "server accept cannot find guid", true,
opal_process_info.nodename, getpid());
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_PROC_MY_HOSTNAME,
getpid(), OPAL_NAME_PRINT(guid), peer);
CLOSE_THE_SOCKET(sd);
free((char *) peer);
return;
}

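The TCP BTL hunk above enriches the "server accept cannot find guid" report: before closing the socket, the handler now looks up the peer address with opal_fd_get_peer_name(sd) and passes it, along with the local and remote GUIDs and the PID, to opal_show_help(); the returned string is freed afterwards. As a rough, self-contained stand-in for what such a peer-name lookup does (the real opal_fd_get_peer_name() lives in OPAL and may differ in detail):

```c
/*
 * Stand-in for the peer-name lookup used in the error path: resolve the
 * remote address of a connected socket into a malloc'ed string that the
 * caller must free, mirroring free((char *) peer) in the diff.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>

static char *peer_name_of(int sd)
{
    struct sockaddr_storage ss;
    socklen_t len = sizeof(ss);
    char buf[INET6_ADDRSTRLEN] = "unknown";

    if (0 == getpeername(sd, (struct sockaddr *) &ss, &len)) {
        if (AF_INET == ss.ss_family) {
            inet_ntop(AF_INET, &((struct sockaddr_in *) &ss)->sin_addr,
                      buf, sizeof(buf));
        } else if (AF_INET6 == ss.ss_family) {
            inet_ntop(AF_INET6, &((struct sockaddr_in6 *) &ss)->sin6_addr,
                      buf, sizeof(buf));
        }
    }
    return strdup(buf);   /* caller frees */
}

int main(void)
{
    char *peer = peer_name_of(0);   /* fd 0 is normally not a socket */
    printf("peer: %s\n", peer);
    free(peer);
    return 0;
}
```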
5 changes: 4 additions & 1 deletion opal/mca/btl/tcp/help-mpi-btl-tcp.txt
@@ -6,6 +6,7 @@
# reserved.
# Copyright (c) 2016 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -121,8 +122,10 @@ entry for that peer.
This attempted connection will be ignored; your MPI job may or may not
continue properly.

Local host: %s
Local guid: %s (on node %s)
PID: %d
Peer guid: %s
Peer IP: %s
#
[server getpeername failed]
WARNING: Open MPI failed to look up the peer IP address information of
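The expanded message carries five substitutions, and they line up positionally with the arguments added to the opal_show_help() call in btl_tcp_component.c above. A minimal sketch of that correspondence, using printf in place of opal_show_help's %s/%d expansion and placeholder values rather than the real OPAL_NAME_PRINT()/OPAL_PROC_MY_HOSTNAME expansions:

```c
/*
 * How the five arguments added to opal_show_help() map onto the new
 * placeholders in the "server accept cannot find guid" help text.
 * Values below are illustrative placeholders only.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char *local_guid = "[12345,0]";  /* OPAL_NAME_PRINT(OPAL_PROC_MY_NAME) */
    const char *local_host = "node01";     /* OPAL_PROC_MY_HOSTNAME              */
    const char *peer_guid  = "[67890,1]";  /* OPAL_NAME_PRINT(guid)              */
    const char *peer_ip    = "10.0.0.7";   /* opal_fd_get_peer_name(sd)          */

    printf("  Local guid: %s (on node %s)\n"
           "  PID:        %d\n"
           "  Peer guid:  %s\n"
           "  Peer IP:    %s\n",
           local_guid, local_host, (int) getpid(), peer_guid, peer_ip);
    return 0;
}
```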