From f2b984d47782a502e570b4f3b8b72d47ed1a967c Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Wed, 5 Apr 2017 15:20:06 -0700 Subject: [PATCH 01/11] Added forest --- src/cobo/cobo.c | 497 +++++++++++++++++- src/cobo/handshake.c | 10 +- src/cobo/ldcs_cobo.h | 10 +- src/logging/spindle_debug.h | 19 +- .../auditserver/ldcs_audit_server_filemngt.c | 13 +- .../auditserver/ldcs_audit_server_handlers.c | 62 ++- src/server/auditserver/ldcs_audit_server_md.h | 2 + .../auditserver/ldcs_audit_server_md_cobo.c | 352 +++++++++++-- .../auditserver/ldcs_audit_server_process.c | 82 +++ .../auditserver/ldcs_audit_server_process.h | 3 +- 10 files changed, 969 insertions(+), 81 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 730ac563..a2ef697f 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -55,6 +55,8 @@ Place, Suite 330, Boston, MA 02111-1307 USA #define COBO_CONNECT_TIMELIMIT (600) /* seconds -- wait this long before giving up for good */ #endif +#define ENABLE_HANDSHAKE + #if defined(_IA64_) #undef htons #undef ntohs @@ -62,6 +64,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA #define ntohs(__bsx) ((((__bsx) >> 8) & 0xff) | (((__bsx) & 0xff) << 8)) #endif +#define err_printf cobo_dbg_printf + + /* * ========================================================================== * ========================================================================== @@ -109,6 +114,14 @@ static int cobo_num_child = 0; /* number of children */ static int* cobo_child_incl = NULL; /* number of children each child is responsible for (includes itself) */ static int cobo_num_child_incl = 0; /* total number of children this node is responsible for */ +/* forest data structures */ +static int cobo_num_forest_childs = 0; /* number of clockwise direction peers (children) */ +static int* cobo_forest_childs = NULL; /* ranks of clockwise direction peers**/ +static int* cobo_forest_childs_fd = NULL; /* sockets to clockwise direction peers */ +//static int cobo_num_forest_parents = 0; /* 
(= cobo_num_forest_childs) */ +static int* cobo_forest_parents = NULL; /* ranks of counterclockwise direction peers (parents) */ +static int* cobo_forest_parents_fd = NULL; /* sockets to counterclockwise direction peers */ + static int cobo_root_fd = -1; static handshake_protocol_t cobo_handshake; @@ -468,6 +481,8 @@ static int cobo_connect_hostname(char* hostname, int rank) saddr = *((struct in_addr *) (*he->h_addr_list)); } + + /* Loop until we make a connection or until our timeout expires. */ struct timeval start, end; cobo_gettimeofday(&start); @@ -484,11 +499,12 @@ static int cobo_connect_hostname(char* hostname, int rank) int port = cobo_ports[i]; /* attempt to connect to hostname on this port */ - debug_printf3("Trying rank %d port %d on %s\n", rank, port, hostname); + debug_printf3("Trying rank %d port %d on %s\n", rank, port, hostname); /* s = cobo_connect(*(struct in_addr *) (*he->h_addr_list), htons(port)); */ s = cobo_connect(saddr, htons(port), connect_timeout); if (s != -1) { /* got a connection, let's test it out */ + // cobo_dbg_printf("Connected to rank %d port %d on %s", rank, port, hostname); debug_printf3("Connected to rank %d port %d on %s\n", rank, port, hostname); int test_failed = 0; @@ -497,9 +513,19 @@ static int cobo_connect_hostname(char* hostname, int rank) case HSHAKE_SUCCESS: break; case HSHAKE_INTERNAL_ERROR: +#if 1 /* Kento modified*/ + /*Max listen process is 1, Between accept() and close(listen_sockfd), + another client process can connect to this port. Becase the server will call close(liste_sockfd) later, + this handshake faile with HSHAKE_INTERNAL_ERROR, so we try another port. 
+ */ + debug_printf3("Internal error doing handshake: %s", spindle_handshake_last_error_str()); + close(s); + continue; +#else err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); exit(-1); break; +#endif case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); close(s); @@ -510,6 +536,7 @@ static int cobo_connect_hostname(char* hostname, int rank) default: assert(0 && "Unknown return value from handshake_server\n"); } + /* write cobo service id */ if (!test_failed && cobo_write_fd_w_suppress(s, &cobo_serviceid, sizeof(cobo_serviceid), 1) < 0) { @@ -745,11 +772,226 @@ static int cobo_compute_children_root_C1() } #endif +static int cobo_create_socket() +{ + /* create a socket to accept connection from parent IPPROTO_TCP */ + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) { + err_printf("Creating parent socket (socket() %m errno=%d)\n", + errno); + exit(1); + } + return sockfd; +} + +static void cobo_bind(int sockfd) +{ + /* TODO: could recycle over port numbers, trying to bind to one for some time */ + /* try to bind the socket to one the ports in our allowed range */ + int i = 0; + int port_is_bound = 0; + while (i < cobo_num_ports && !port_is_bound) { + /* pick a port */ + int port = cobo_ports[i]; + i++; + + /* set up an address using our selected port */ + struct sockaddr_in sin; + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + + /* attempt to bind a socket on this port */ + if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + debug_printf3("Binding parent socket (bind() %m errno=%d) port=%d\n", + errno, port); + continue; + } + + /* bound and listening on our port */ + debug_printf3("Opened socket on port %d\n", port); + port_is_bound = 1; + } + + /* failed to bind to a port, this is fatal */ + if (!port_is_bound) { + /* TODO: would like to send an abort back to server */ + 
err_printf("Failed to open socket on any port\n"); + exit(1); + } + +} + +static void cobo_listen(int sockfd) +{ + /* set the socket to listen for connections */ + if (listen(sockfd, 1) < 0) { + cobo_dbg_printf("Setting parent socket to listen (listen() %m errno=%d)", errno); + exit(1); + } + return; +} + +/* bind and listen socket */ +static void cobo_bind_and_listen(int sockfd) +{ + /* TODO: could recycle over port numbers, trying to bind to one for some time */ + /* try to bind the socket to one the ports in our allowed range */ + int i = 0; + int port_is_bound = 0; + while (i < cobo_num_ports && !port_is_bound) { + /* pick a port */ + int port = cobo_ports[i]; + i++; + + /* set up an address using our selected port */ + struct sockaddr_in sin; + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + + /* attempt to bind a socket on this port */ + if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + debug_printf3("Binding parent socket (bind() %m errno=%d) port=%d\n", + errno, port); + continue; + } + + /* set the socket to listen for connections */ + if (listen(sockfd, 1) < 0) { + debug_printf3("Setting parent socket to listen (listen() %m errno=%d) port=%d\n", + errno, port); + continue; + } + + /* bound and listening on our port */ + debug_printf3("Opened socket on port %d\n", port); + port_is_bound = 1; + } + + /* failed to bind to a port, this is fatal */ + if (!port_is_bound) { + /* TODO: would like to send an abort back to server */ + err_printf("Failed to open socket on any port\n"); + exit(1); + } +} + +static int cobo_accept_and_handshake(int sockfd) +{ + int accepted_sockfd; + /* accept a connection from parent and receive socket table */ + int reply_timeout = cobo_connect_timeout * 100; + int have_parent = 0; + while (!have_parent) { + struct sockaddr parent_addr; + socklen_t parent_len = sizeof(parent_addr); + accepted_sockfd = accept(sockfd, (struct sockaddr 
*) &parent_addr, &parent_len); + _cobo_opt_socket(sockfd); + + /* handshake/authenticate our connection to make sure it one of our processes */ + int result = spindle_handshake_server(accepted_sockfd, &cobo_handshake, cobo_sessionid); + switch (result) { + case HSHAKE_SUCCESS: + break; + case HSHAKE_INTERNAL_ERROR: + err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); + exit(-1); + break; + case HSHAKE_DROP_CONNECTION: + debug_printf3("Handshake said to drop connection\n"); + close(accepted_sockfd); + continue; + case HSHAKE_ABORT: + handle_security_error(spindle_handshake_last_error_str()); + abort(); + default: + assert(0 && "Unknown return value from handshake_server\n"); + } + + /* read the service id */ + unsigned int received_serviceid = 0; + if (cobo_read_fd_w_timeout(accepted_sockfd, &received_serviceid, sizeof(received_serviceid), reply_timeout) < 0) { + debug_printf3("Receiving service id from new connection failed\n"); + close(accepted_sockfd); + continue; + } + + /* read the session id */ + uint64_t received_sessionid = 0; + if (cobo_read_fd_w_timeout(accepted_sockfd, &received_sessionid, sizeof(received_sessionid), reply_timeout) < 0) { + debug_printf3("Receiving session id from new connection failed\n"); + close(accepted_sockfd); + continue; + } + + /* check that we got the expected sesrive and session ids */ + /* TODO: reply with some sort of error message if no match? 
*/ + if (received_serviceid != cobo_serviceid || received_sessionid != cobo_sessionid) { + close(accepted_sockfd); + continue; + } + + /* write our service id back as a reply */ + if (cobo_write_fd_w_suppress(accepted_sockfd, &cobo_serviceid, sizeof(cobo_serviceid), 1) < 0) { + debug_printf3("Writing service id to new connection failed\n"); + close(accepted_sockfd); + continue; + } + + /* write our accept id back as a reply */ + if (cobo_write_fd_w_suppress(accepted_sockfd, &cobo_acceptid, sizeof(cobo_acceptid), 1) < 0) { + debug_printf3("Writing accept id to new connection failed\n"); + close(accepted_sockfd); + continue; + } + + /* our parent may have dropped us if he was too impatient waiting for our reply, + * read his ack to know that he completed the connection */ + unsigned int ack = 0; + if (cobo_read_fd_w_timeout(accepted_sockfd, &ack, sizeof(ack), reply_timeout) < 0) { + debug_printf3("Receiving ack to finalize connection\n"); + close(accepted_sockfd); + continue; + } + + /* if we get here, we've got a good connection to our parent */ + have_parent = 1; + } + // cobo_dbg_printf("handhsare done (rank: %d)", cobo_me); + return accepted_sockfd; +} + +static int cobo_connect_rank(int rank) +{ + char* hostname = cobo_expand_hostname(rank); + int sockfd = cobo_connect_hostname(hostname, rank); + if (sockfd == -1) { + err_printf("Failed to connect to child (rank %d) on %s failed\n", + rank, hostname); + exit(1); + } + free(hostname); + return sockfd; +} + + /* open socket tree across tasks */ static int cobo_open_tree() { + int sockfd; + int i = 0; +#if 1 + sockfd = cobo_create_socket(); + cobo_bind_and_listen(sockfd); + cobo_parent_fd = cobo_accept_and_handshake(sockfd); + // sleep(1); + close(sockfd); +#else /* create a socket to accept connection from parent IPPROTO_TCP */ - int sockfd = socket(AF_INET, SOCK_STREAM, 0); + sockfd = socket(AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { err_printf("Creating parent socket (socket() %m errno=%d)\n", errno); @@ -758,7 
+1000,7 @@ static int cobo_open_tree() /* TODO: could recycle over port numbers, trying to bind to one for some time */ /* try to bind the socket to one the ports in our allowed range */ - int i = 0; + int port_is_bound = 0; while (i < cobo_num_ports && !port_is_bound) { /* pick a port */ @@ -774,6 +1016,7 @@ static int cobo_open_tree() /* attempt to bind a socket on this port */ if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + debug_printf3("Binding parent socket (bind() %m errno=%d) port=%d\n", errno, port); continue; @@ -798,6 +1041,7 @@ static int cobo_open_tree() exit(1); } + /* accept a connection from parent and receive socket table */ int reply_timeout = cobo_connect_timeout * 100; int have_parent = 0; @@ -880,6 +1124,7 @@ static int cobo_open_tree() /* we've got the connection to our parent, so close the listening socket */ close(sockfd); +#endif cobo_gettimeofday(&tree_start); @@ -930,8 +1175,8 @@ static int cobo_open_tree() int c = cobo_child[i]; char* child_hostname = cobo_expand_hostname(c); - debug_printf3("%d: on COBO%02d: connect to child #%02d (%s)\n",i,cobo_me,c,child_hostname); + debug_printf3("%d: on COBO%02d: connect to child #%02d (%s)\n",i,cobo_me,c,child_hostname); /* connect to child */ cobo_child_fd[i] = cobo_connect_hostname(child_hostname, c); if (cobo_child_fd[i] == -1) { @@ -951,6 +1196,7 @@ static int cobo_open_tree() /* free the child hostname string */ free(child_hostname); + } return COBO_SUCCESS; @@ -980,6 +1226,98 @@ static int cobo_close_tree() return COBO_SUCCESS; } +/* open socket forest */ +static int cobo_open_forest() +{ + + int i; + /* read our rank number */ + /* if (cobo_me == 0) { */ + /* int i; */ + /* cobo_dbg_printf("cobo_me: %d, cobo_nprocs: %d, cobo_hostlist_size: %d", cobo_me, cobo_nprocs, cobo_hostlist_size); */ + /* cobo_dbg_printf("cobo_hostlist:"); */ + /* for (i = 0; i < cobo_nprocs; i++) { */ + /* cobo_dbg_printf(" %s", cobo_expand_hostname(i)); */ + /* } */ + /* } */ + + /* compute ranks of 
peers */ + { + int rank_offset = 1; + while (rank_offset <= cobo_nprocs - 1) { + cobo_num_forest_childs++; + rank_offset = rank_offset << 1; + } + + + i = 0; + rank_offset = 1; + cobo_forest_childs = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "clockwise peer buffer"); + cobo_forest_parents = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "counterclockwise peer buffer"); + while (rank_offset <= cobo_nprocs - 1) { + cobo_forest_childs[i] = cobo_me + rank_offset; + cobo_forest_parents[i] = (cobo_me + cobo_nprocs - rank_offset) % cobo_nprocs; + rank_offset = rank_offset << 1; + i++; + } + + /* if (cobo_me == 12) { */ + /* for (i = 0; i < cobo_num_forest_childs; i++) { */ + /* cobo_dbg_printf("-> %d", cobo_forest_childs[i]); */ + /* } */ + /* cobo_dbg_printf("======"); */ + /* for (i = 0; i < cobo_num_forest_childs; i++) { */ + /* cobo_dbg_printf("-> %d", cobo_forest_parents[i]); */ + /* } */ + /* } */ + } + + /* Create Binominal Forest overlay network */ + int sockfd; + int num_successive_listen_rank_num = 1; + cobo_forest_childs_fd = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "clockwise peer fd buffer"); + cobo_forest_parents_fd = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "counterclockwise peer fd buffer"); + + sockfd = cobo_create_socket(); + cobo_bind(sockfd); + for (i = 0; i < cobo_num_forest_childs; i++) { + int connection_group_id = cobo_me / num_successive_listen_rank_num; + int conn_rank = cobo_forest_parents[i]; + /* TODO: Reuse alread binded socket in every iterations + => For now: Create and close sockets every iterations + */ + /* TODO: Reuse sockets opened in cobo_open_tree() + => For now: Sockets (connections) redundancy in tree and forest + */ + + if (connection_group_id % 2 == 0) { + /* Active connection => Passive connection */ + // cobo_dbg_printf("%d: connect to %d (Hop: %d)", cobo_me, conn_rank, num_successive_listen_rank_num); + cobo_forest_childs_fd[i] = cobo_connect_rank(conn_rank); + // 
cobo_dbg_printf("%d: listen (Hop: %d)", cobo_me, num_successive_listen_rank_num); + cobo_listen(sockfd); + // cobo_dbg_printf("%d: accept (Hop: %d)", cobo_me, num_successive_listen_rank_num); + cobo_forest_parents_fd[i] = cobo_accept_and_handshake(sockfd); + } else { + /* Passive connection -> Active connection */ + // cobo_dbg_printf("%d: listen (Hop: %d)", cobo_me, num_successive_listen_rank_num); + cobo_listen(sockfd); + // cobo_dbg_printf("%d: accept (Hop: %d)", cobo_me, num_successive_listen_rank_num); + cobo_forest_parents_fd[i] = cobo_accept_and_handshake(sockfd); + // cobo_dbg_printf("%d: connect to %d (Hop: %d)", cobo_me, conn_rank, num_successive_listen_rank_num); + cobo_forest_childs_fd[i] = cobo_connect_rank(conn_rank); + } + num_successive_listen_rank_num = num_successive_listen_rank_num << 1; + } + close(sockfd); + // cobo_dbg_printf("============= %d", cobo_me); + // sleep(1000); + // cobo_dbg_printf("============= %d", cobo_me); + // exit(0); + + return COBO_SUCCESS; +} + /* * ============================= * Functions to bcast/gather/scatter with root as rank 0 using the TCP/socket tree. 
@@ -1148,12 +1486,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) return rc; } -int cobo_get_child_socket(int num, int *fd) -{ - assert(num < cobo_num_child); - *fd = cobo_child_fd[num]; - return COBO_SUCCESS; -} + /* * ========================================================================== @@ -1163,11 +1496,120 @@ int cobo_get_child_socket(int num, int *fd) * ========================================================================== */ -/* NEW */ +int cobo_get_num_tree(int *num_trees) +{ + *num_trees = cobo_nprocs; + return COBO_SUCCESS; +} + +int cobo_get_forest_child_socket(int root, int num, int *fd) +{ + int num_forest_childs; + if (root == COBO_FOREST) { + *fd = cobo_forest_childs_fd[num]; + return COBO_SUCCESS; + } + + cobo_get_num_forest_childs(root, &num_forest_childs); + if (num >= num_forest_childs) { + cobo_dbg_printf("Requested child %d, but # of childs is %d", num, num_forest_childs); + exit(1); + } + *fd = cobo_forest_childs_fd[num]; + return COBO_SUCCESS; +} + +int cobo_get_num_forest_childs(int root, int* num_forest_childs) +{ + int num_childs = 0; + int logical_rank = 0; /*logical_rank of the root is 0*/ + int tmp_cobo_nprocs; + + if (root == COBO_FOREST) { + *num_forest_childs = cobo_num_forest_childs; + return COBO_SUCCESS; + } + + /*TODO: memoriaze already-computed num in an array, and simply return it*/ + /*Making the root's logical_rank 0 in this tree (root)*/ + logical_rank = (cobo_me + cobo_nprocs - root) % cobo_nprocs; + tmp_cobo_nprocs = cobo_nprocs >> 1; + // cobo_dbg_printf("logical_rank: %d, cobo_nprocs: %d", logical_rank, tmp_cobo_nprocs); + while(tmp_cobo_nprocs) { + if (logical_rank & 0x1) break; + // cobo_dbg_printf(" logical_rank: %d, cobo_nprocs: %d", logical_rank, tmp_cobo_nprocs); + num_childs++; + logical_rank = logical_rank >> 1; + tmp_cobo_nprocs = tmp_cobo_nprocs >> 1; + } + *num_forest_childs = num_childs; + return COBO_SUCCESS; +} + +int cobo_get_forest_parent_socket(int root, int *fd) +{ 
+ int num_childs; + int forest_parents_fd_index; + if (cobo_me == root) { + if (cobo_me != 0) { + cobo_dbg_printf("root (tree_id=%d) does not have parent", root); + exit(1); + } + *fd = cobo_parent_fd; + // fprintf(stderr, "cobo_index: %d\n", -1); + } else { + cobo_get_num_forest_childs(root, &num_childs); + forest_parents_fd_index = num_childs; + *fd = cobo_forest_parents_fd[forest_parents_fd_index]; + // fprintf(stderr, "cobo_index: %d\n", forest_parents_fd_index); + } + return COBO_SUCCESS; +} + +int cobo_get_num_forest_parents(int root, int *num_parents) +{ + if (root == COBO_FOREST) { + *num_parents = cobo_num_forest_childs; + } else { + *num_parents = 1; + } + return COBO_SUCCESS; +} + +int cobo_get_forest_parent_socket_at(int num, int *fd) +{ + *fd = cobo_forest_parents_fd[num]; + return COBO_SUCCESS; +} + +#define COBO_FOREST + +#ifdef COBO_FOREST +int cobo_get_child_socket(int num, int *fd) +{ + cobo_get_forest_child_socket(0, num, fd); + return COBO_SUCCESS; +} +#else +int cobo_get_child_socket(int num, int *fd) +{ + assert(num < cobo_num_child); + *fd = cobo_child_fd[num]; + return COBO_SUCCESS; +} +#endif + +#ifdef COBO_FOREST +int cobo_get_num_childs(int* num_childs) { + cobo_get_num_forest_childs(0, num_childs); + return COBO_SUCCESS; +} +#else int cobo_get_num_childs(int* num_childs) { *num_childs=cobo_num_child; return COBO_SUCCESS; } +#endif @@ -1175,16 +1617,23 @@ int cobo_get_num_childs(int* num_childs) { /* TODO: the upside here is that the upper layer can directly use our * communication tree, but the downside is that it exposes the implementation * and forces sockets */ +#ifdef COBO_FOREST +int cobo_get_parent_socket(int* fd) +{ + cobo_get_forest_parent_socket(0, fd); +} +#else int cobo_get_parent_socket(int* fd) { if (cobo_parent_fd != -1) { *fd = cobo_parent_fd; return COBO_SUCCESS; } - - return -1; /* failure RCs? */ + return -1; /* failure RCs? 
*/ } +#endif + /* Perform barrier, each task writes an int then waits for an int */ int cobo_barrier() { @@ -1321,6 +1770,15 @@ int cobo_alltoall(void* sendbuf, int sendcount, void* recvbuf) return rc; } +/* + * Perform MPI-like Allreduce maximum of a single int from each task + */ +static int cobo_allreduce() +{ + + +} + /* * Perform MPI-like Allreduce maximum of a single int from each task */ @@ -1394,6 +1852,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } + /* provide list of ports and number of ports as input, get number of tasks and my rank as output */ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* num_ranks) { @@ -1401,6 +1860,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* char *value; struct timeval start, end; + cobo_gettimeofday(&time_open); cobo_gettimeofday(&start); /* we now know this process is a client, although we don't know what our rank is yet */ @@ -1448,6 +1908,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* /* open the tree */ cobo_open_tree(); + /* need to check that tree opened successfully before returning, so do a barrier */ if (cobo_barrier() != COBO_SUCCESS) { err_printf("Failed to open tree\n"); @@ -1465,6 +1926,10 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_gettimeofday(&end); debug_printf3("Exiting cobo_init(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); + + /* open the forest */ + cobo_open_forest(); + return COBO_SUCCESS; } @@ -1474,7 +1939,6 @@ int cobo_close() struct timeval start, end; cobo_gettimeofday(&start); debug_printf3("Starting cobo_close()"); - /* shut down the tree */ cobo_close_tree(); @@ -1486,6 +1950,9 @@ int cobo_close() debug_printf3("Exiting cobo_close(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); debug_printf3("Total time from cobo_open() to cobo_close() 
took %f seconds for %d procs\n", cobo_getsecs(&time_close, &time_open), cobo_nprocs); + if (cobo_me == 0) { + cobo_dbg_printf("Total time: %f seconds (%d procs)", cobo_getsecs(&time_close, &time_open), cobo_nprocs); + } return COBO_SUCCESS; } @@ -1586,7 +2053,7 @@ int cobo_server_close() /* free data structures */ cobo_free(cobo_ports); cobo_free(cobo_hostlist); - + return COBO_SUCCESS; } diff --git a/src/cobo/handshake.c b/src/cobo/handshake.c index 382e9765..21129dc0 100644 --- a/src/cobo/handshake.c +++ b/src/cobo/handshake.c @@ -66,6 +66,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA if (debug_file) { \ fprintf(debug_file, "ERROR: [%s:%u] - %s", BASE_FILE, __LINE__, last_error_message); \ } \ + fprintf(stderr, "ERROR: [%s:%u] - %s", BASE_FILE, __LINE__, last_error_message); \ } while (0) #define security_error_printf(format, ...) \ @@ -239,6 +240,7 @@ static int handshake_wrapper(int sockfd, handshake_protocol_t *hdata, uint64_t s return_result = result; goto done; } + for (;;) { result = handshake_main(sockfd, hdata, session_id, is_server); @@ -786,8 +788,14 @@ static int reliable_read(int fd, void *buf, size_t size) } result = read(fd, ((unsigned char *) buf) + bytes_read, size - bytes_read); if (result <= 0) { - error_printf("Expected error return %d when reading from socket: %s\n", result, +#if 1 /*Kento modified*/ + /*During handshaking, connection can be dropped. 
So handle this as debug print */ + debug_printf("Expected error return %d when reading from socket: %s\n", result, + strerror(errno)); +#else + error_printf("Expected error return %d when reading from socket: %s\n", result, strerror(errno)); +#endif return HSHAKE_INTERNAL_ERROR; } else diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index bbef99a1..35678046 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -39,6 +39,8 @@ extern "C" { #include "handshake.h" #define COBO_SUCCESS (0) +#define COBO_PRIMARY_TREE (0) +#define COBO_FOREST (-1) #define COBO_NAMESPACE ldcs @@ -148,8 +150,14 @@ int cobo_server_get_root_socket(int* fd); extern double __cobo_ts; -int cobo_get_num_childs(int* num_childs); +int cobo_get_num_tree(int *num_trees); +int cobo_get_forest_child_socket(int tree_id, int num, int *fd); +int cobo_get_num_forest_childs(int tree_id, int* num_childs); +int cobo_get_forest_parent_socket(int tree_id, int *fd); +int cobo_get_forest_parent_socket_at(int num, int *fd); +int cobo_get_num_forest_parents(int tree_id, int *num_parents); +int cobo_get_num_childs(int* num_childs); /* Methods to access child fds */ int cobo_get_child_socket(int num, int *fd); diff --git a/src/logging/spindle_debug.h b/src/logging/spindle_debug.h index 65429987..c8c796b9 100644 --- a/src/logging/spindle_debug.h +++ b/src/logging/spindle_debug.h @@ -20,7 +20,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include -#define LOGD_DEBUG +#define LOGD_DEBUG #if defined(LOGD_DEBUG) #define LDCSDEBUG 1 @@ -31,12 +31,14 @@ extern "C" { #if defined(__cplusplus) } #endif + #elif defined(DEBUG) #define LDCSDEBUG 1 #define debug_printf(format, ...) 
\ do { \ fprintf(stderr, "[%s:%u@%d] - " format, __FILE__, __LINE__, getpid(), ## __VA_ARGS__); \ } while (0) + #elif defined(SIONDEBUG) #define LDCSDEBUG 1 #include "sion_debug.h" @@ -44,8 +46,10 @@ extern "C" { do { \ sion_dprintfp(32, __FILE__, getpid(), "[L%04u, %12.2f] - " format, __LINE__,_sion_get_time(), ## __VA_ARGS__); \ } while (0) + #else #define debug_printf(format, ...) + #endif #if defined(LOGD_DEBUG) @@ -66,4 +70,17 @@ extern "C" { #define err_printf(S, ...) debug_printf(S, ## __VA_ARGS__) #endif + +#define FOREST_DEBUG +#if defined(FOREST_DEBUG) +#define cobo_dbg_printf(format, ...) \ + do { \ + fprintf(stderr, "COBO:%6d: " format " (%s:%d)\n", getpid(), ## __VA_ARGS__, __FILE__, __LINE__); \ + } while (0) + +#else +#define cobo_dbg_printf(format, ...) +#define md_cobo_dbg_printf(format, ...) +#endif + #endif diff --git a/src/server/auditserver/ldcs_audit_server_filemngt.c b/src/server/auditserver/ldcs_audit_server_filemngt.c index d25a8064..51767411 100644 --- a/src/server/auditserver/ldcs_audit_server_filemngt.c +++ b/src/server/auditserver/ldcs_audit_server_filemngt.c @@ -34,7 +34,6 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ldcs_api_listen.h" #include "ldcs_audit_server_process.h" #include "ldcs_audit_server_filemngt.h" -#include "ldcs_elf_read.h" #include "config.h" #if !defined(LIBEXECDIR) @@ -58,6 +57,8 @@ int ldcs_audit_server_filemngt_init (char* location) { return(rc); } +extern int read_file_and_strip(FILE *f, void *data, size_t *size); + char *filemngt_calc_localname(char *global_name) { static unsigned int unique_str_num = 0; @@ -107,7 +108,15 @@ int filemngt_read_file(char *filename, void *buffer, size_t *size, int strip) return -1; } - result = read_file_and_strip(f, buffer, size, strip); + if (strip) { + result = read_file_and_strip(f, buffer, size); + } + else { + do { + result = fread(buffer, 1, *size, f); + } while (result == -1 && errno == EINTR); + result = (result == *size) ? 
0 : -1; + } if (result == -1) err_printf("Error reading from file %s: %s\n", filename, strerror(errno)); diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 25f6f107..4a6f4f5d 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -537,15 +537,18 @@ static void *handle_setup_file_buffer(ldcs_process_data_t *procdata, char *pathn cresult = ldcs_cache_findFileDirInCache(filename, dirname, localname); if (cresult == LDCS_CACHE_FILE_FOUND && *localname) { debug_printf3("File %s was already in cache with localname %s\n", pathname, *localname); + // cobo_dbg_printf("File %s (dir: %s) was already in cache with localname %s", pathname, dirname, *localname); *already_loaded = 1; return NULL; } else if (cresult == LDCS_CACHE_FILE_FOUND) { debug_printf3("File %s was in cache, but not stored on local disk\n", pathname); + // cobo_dbg_printf("File %s was in cache, but not stored on local disk", pathname); *already_loaded = 0; } else if (cresult == LDCS_CACHE_FILE_NOT_FOUND) { debug_printf3("File %s wasn't in cache\n", pathname); + // cobo_dbg_printf("File %s wasn't in cache", pathname); ldcs_cache_addFileDir(dirname, filename); *already_loaded = 0; } @@ -630,9 +633,9 @@ static int handle_read_and_broadcast_file(ldcs_process_data_t *procdata, char *p /* Setup buffer for file contents */ buffer = handle_setup_file_buffer(procdata, pathname, size, &fd, &localname, &already_loaded); if (!buffer) { - assert(!already_loaded); - global_result = -1; - goto done; + assert(!already_loaded); + global_result = -1; + goto done; } /* Actually read the file into the buffer */ @@ -658,6 +661,7 @@ static int handle_read_and_broadcast_file(ldcs_process_data_t *procdata, char *p goto done; } + /* distribute file data */ if (bcast != suppress_broadcast) { result = handle_broadcast_file(procdata, pathname, buffer, newsize, bcast); @@ -798,6 +802,7 @@ static 
int handle_exit_broadcast(ldcs_process_data_t *procdata) out_msg.header.len = 0; out_msg.data = NULL; + procdata->md_path = NULL; ldcs_audit_server_md_broadcast(procdata, &out_msg); mark_exit(); @@ -819,11 +824,13 @@ static int handle_request(ldcs_process_data_t *procdata, node_peer_t from, ldcs_ err_printf("Badly formed request message with starting char '%c'\n", msg_type); return -1; } + procdata->md_path = pathname; is_dir = (msg_type == 'D'); if (is_dir) return handle_request_directory(procdata, from, pathname); else return handle_request_file(procdata, from, pathname); + procdata->md_path = NULL; return result; } @@ -1002,8 +1009,12 @@ static int handle_send_directory_query(ldcs_process_data_t *procdata, char *dire bytes_written = snprintf(out_msg.data, MAX_PATH_LEN+1, "D%s", directory); out_msg.header.len = bytes_written+1; - + procdata->md_path = directory; +#ifdef LDCS_DBG + cobo_dbg_printf("handle_send_directory_query: %s (%s)", procdata->md_path, directory); +#endif ldcs_audit_server_md_forward_query(procdata, &out_msg); + procdata->md_path = NULL; return 0; } @@ -1023,7 +1034,13 @@ static int handle_send_file_query(ldcs_process_data_t *procdata, char *fullpath) bytes_written = snprintf(out_msg.data, MAX_PATH_LEN+1, "F%s", fullpath); out_msg.header.len = bytes_written+1; + + procdata->md_path = fullpath; +#ifdef LDCS_DBG + cobo_dbg_printf("handle_send_file_query: %s (%s)", procdata->md_path, fullpath); +#endif ldcs_audit_server_md_forward_query(procdata, &out_msg); + procdata->md_path = NULL; return 0; } @@ -1155,7 +1172,9 @@ int handle_send_msg_to_keys(ldcs_process_data_t *procdata, ldcs_message_t *msg, if (procdata->dist_model == LDCS_PUSH || force_broadcast) { debug_printf3("Pushing message to all children\n"); + procdata->md_path = key; result = ldcs_audit_server_md_broadcast_noncontig(procdata, msg, secondary_data, secondary_size); + procdata->md_path = NULL; if (result == -1) global_result = -1; have_done_broadcast = 1; @@ -1231,34 +1250,49 @@ int 
handle_server_message(ldcs_process_data_t *procdata, node_peer_t peer, ldcs_ { switch (msg->header.type) { case LDCS_MSG_CACHE_ENTRIES: + // md_cobo_dbg_printf("LDCS_MSG_CACHE_ENTRIES"); return handle_directory_recv(procdata, msg, request_broadcast); case LDCS_MSG_FILE_DATA: + // md_cobo_dbg_printf("LDCS_MSG_FILE_DATA"); return handle_file_recv(procdata, msg, peer, request_broadcast); case LDCS_MSG_FILE_REQUEST: + // md_cobo_dbg_printf("LDCS_MSG_FILE_REQUEST"); return handle_request(procdata, peer, msg); case LDCS_MSG_EXIT: + // md_cobo_dbg_printf("LDCS_MSG_EXIT"); return handle_exit_broadcast(procdata); case LDCS_MSG_PRELOAD_FILELIST: + // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_FILELIST"); return handle_preload_filelist(procdata, msg); case LDCS_MSG_PRELOAD_DIR: + // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_DIR"); return handle_directory_recv(procdata, msg, preload_broadcast); case LDCS_MSG_PRELOAD_FILE: + // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_FILE"); return handle_file_recv(procdata, msg, peer, preload_broadcast); case LDCS_MSG_PRELOAD_DONE: + // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_DONE"); return handle_preload_done(procdata); case LDCS_MSG_SELFLOAD_FILE: + // md_cobo_dbg_printf("LDCS_MSG_SELFLOAD_FILE"); return handle_recv_selfload_file(procdata, msg); case LDCS_MSG_STAT_NET_RESULT: + // md_cobo_dbg_printf("LDCS_MSG_STAT_NET_RESULT"); return handle_metadata_recv(procdata, msg, metadata_stat, peer); case LDCS_MSG_STAT_NET_REQUEST: + // md_cobo_dbg_printf("LDCS_MSG_STAT_NET_REQUEST"); return handle_metadata_request_recv(procdata, msg, metadata_stat, peer); case LDCS_MSG_LOADER_DATA_NET_RESP: + // md_cobo_dbg_printf("LDCS_MSG_LOADER_DATA_NET_RESP"); return handle_metadata_recv(procdata, msg, metadata_loader, peer); case LDCS_MSG_LOADER_DATA_NET_REQ: + // md_cobo_dbg_printf("LDCS_MSG_LOADER_DATA_NET_REQ"); return handle_metadata_request_recv(procdata, msg, metadata_loader, peer); case LDCS_MSG_EXIT_READY: + // md_cobo_dbg_printf("LDCS_MSG_EXIT_READY"); return 
handle_exit_ready_msg(procdata, msg); case LDCS_MSG_EXIT_CANCEL: + // md_cobo_dbg_printf("LDCS_MSG_EXIT_CANCEL"); return handle_exit_cancel_msg(procdata, msg); default: err_printf("Received unexpected message from node: %d\n", (int) msg->header.type); @@ -1382,6 +1416,7 @@ static int handle_preload_done(ldcs_process_data_t *procdata) done_msg.header.len = 0; done_msg.data = NULL; + procdata->md_path = NULL; result = ldcs_audit_server_md_broadcast(procdata, &done_msg); if (result == -1) { err_printf("Error broadcasting done message during preload\n"); @@ -1409,6 +1444,7 @@ static int handle_recv_selfload_file(ldcs_process_data_t *procdata, ldcs_message int result, nc, global_result = 0, found_client = 0; debug_printf("Recieved notice to selfload file %s\n", filename); + result = handle_send_msg_to_keys(procdata, msg, filename, NULL, 0, request_broadcast, 0); if (result == -1) { err_printf("Could not send selfload file message\n"); @@ -1745,6 +1781,7 @@ static int handle_broadcast_metadata(ldcs_process_data_t *procdata, char *pathna /* Send packet on network */ starttime = ldcs_get_time(); + result = handle_send_msg_to_keys(procdata, &msg, pathname, NULL, 0, 0, 1); procdata->server_stat.libdist.cnt++; procdata->server_stat.libdist.bytes += packet_size; @@ -1858,6 +1895,7 @@ static int handle_metadata_request(ldcs_process_data_t *procdata, char *pathname { ldcs_message_t msg; int pathlen; + int ret; if (been_requested(procdata->pending_metadata_requests, pathname)) { debug_printf2("Metadata %s has already been requested. 
Not resending request\n", pathname); @@ -1877,7 +1915,13 @@ static int handle_metadata_request(ldcs_process_data_t *procdata, char *pathname msg.header.len = pathlen; msg.data = pathname; - return ldcs_audit_server_md_forward_query(procdata, &msg); + procdata->md_path = pathname; +#ifdef LDCS_DBG + cobo_dbg_printf("handle_metadata_request: %s", procdata->md_path); +#endif + ret = ldcs_audit_server_md_forward_query(procdata, &msg); + procdata->md_path = NULL; + return ret; } /** @@ -1999,6 +2043,10 @@ static int handle_send_exit_ready_if_done(ldcs_process_data_t *procdata) } else { debug_printf2("Sending exit ready message to parent\n"); + procdata->md_path = NULL; +#ifdef LDCS_DBG + cobo_dbg_printf("handle_send_exit_ready_if_done: %s", procdata->md_path); +#endif return ldcs_audit_server_md_forward_query(procdata, &msg); } } @@ -2054,5 +2102,9 @@ static int handle_send_exit_cancel(ldcs_process_data_t *procdata) msg.header.len = 0; msg.data = NULL; + procdata->md_path = NULL; +#ifdef LDCS_DBG + cobo_dbg_printf("handle_send_exit_cancel: %s", procdata->md_path); +#endif return ldcs_audit_server_md_forward_query(procdata, &msg); } diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index da692408..45ff67d5 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -101,6 +101,8 @@ int ldcs_audit_server_md_send_noncontig(ldcs_process_data_t *ldcs_process_data, int ldcs_audit_server_md_broadcast_noncontig(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg, void *secondary_data, size_t secondary_size); +void ldcs_audit_server_md_barrier(); + int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); #if defined(__cplusplus) diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 65da5506..8f66806d 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ 
b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -33,13 +33,90 @@ #include "cobo_comm.h" #include "config.h" + int ldcs_audit_server_md_cobo_CB ( int fd, int nc, void *data ); int ldcs_audit_server_md_cobo_send_msg ( int fd, ldcs_message_t *msg ); extern unique_id_t unique_id; +int cobo_rank = -1; +int cobo_size = -1; +int spindle_root_count = -1; +int spindle_root_hop = -1; + extern int ll_read(int fd, void *buf, size_t count); +static void ldcs_audit_server_md_backtrace_print() +{ + int j, nptrs; + void *buffer[100]; + char **strings; + + nptrs = backtrace(buffer, 100); + + /* backtrace_symbols_fd(buffer, nptrs, STDOUT_FILENO)*/ + strings = backtrace_symbols(buffer, nptrs); + if (strings == NULL) { + perror("backtrace_symbols"); + exit(EXIT_FAILURE); + } + + /* + You can translate the address to function name by + addr2line -f -e ./a.out
+ */ + for (j = 0; j < nptrs; j++) { + cobo_dbg_printf(" %s", strings[j]); + } + free(strings); + + return; +} + +static unsigned int ldcs_audit_server_md_hashval(char* str) +{ + unsigned int hash = 5381; + unsigned int c; + while ((c = *str++)) { + hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + } + return hash; +} + +//static int ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data_t *ldcs_process_data) +static int ldcs_audit_server_md_get_responsible_tree_id(char *path) +{ + int responsible_tree_id = COBO_PRIMARY_TREE; + int ret; + struct stat st; + char *dir, *path_dup; + + /*kento*/ + /*TODO: Can I use message header for this if-else to find out + if tree_id is for management communication or library broadcast ? */ + if (path != NULL && strlen(path) > 0) { + //responsible_tree_id = spindle_root_count -1;//COBO_PRIMARY_TREE; + path_dup = strdup(path); + ret = stat(path_dup, &st); + if ((st.st_mode & S_IFMT) == S_IFDIR) { + /* path is directory */ + dir = path_dup; + } else { + /* path is not a directory */ + dir = dirname(path_dup); + } + responsible_tree_id = ldcs_audit_server_md_hashval(dir) % spindle_root_count; + // responsible_tree_id = ldcs_audit_server_md_hashval(path) % (spindle_root_count - 1); + // responsible_tree_id++; +#ifdef LDCS_DBG + cobo_dbg_printf("%s -> %d", path, responsible_tree_id); +#endif + // ldcs_audit_server_md_backtrace_print(); + free(path_dup); + } + return responsible_tree_id; +} + int read_msg(int fd, node_peer_t *peer, ldcs_message_t *msg) { int result; @@ -84,6 +161,8 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, unsigned int *portlist; int my_rank, ranks, fanout; int i; + char* env; + portlist = malloc(sizeof(unsigned int) * (num_ports + 1)); for (i = 0; i < num_ports; i++) { @@ -100,11 +179,12 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, debug_printf2("cobo_open complete. 
Cobo rank %d/%d\n", my_rank, ranks); free(portlist); - data->server_stat.md_rank = data->md_rank = my_rank; - data->server_stat.md_size = data->md_size = ranks; + cobo_rank = data->server_stat.md_rank = data->md_rank = my_rank; + cobo_size = data->server_stat.md_size = data->md_size = ranks; data->md_listen_to_parent = 0; - cobo_get_num_childs(&fanout); + // cobo_get_num_childs(&fanout); + cobo_get_num_forest_childs(COBO_PRIMARY_TREE, &fanout); data->server_stat.md_fan_out = data->md_fan_out = fanout; cobo_barrier(); @@ -112,87 +192,228 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, /* send ack about being ready */ if (data->md_rank == 0) { int root_fd, ack=13; - /* send fe client signal to stop (ack) */ - cobo_get_parent_socket(&root_fd); + // cobo_get_parent_socket(&root_fd); + cobo_get_forest_parent_socket(COBO_PRIMARY_TREE, &root_fd); ldcs_cobo_write_fd(root_fd, &ack, sizeof(ack)); debug_printf3("sent FE client signal that server are ready %d\n",ack); } - + if (NULL == (env = getenv("SPINDLE_ROOT_COUNT"))) { + spindle_root_count = 1; + } else { + if (atoi(env) == 0) { + spindle_root_count = 1; + } else { + spindle_root_count = atoi(env); + } + } + + if (spindle_root_count > ranks || spindle_root_count <= 0) { + cobo_dbg_printf("spindle_root_count(%d) error", spindle_root_count); + err_printf("spindle_root_count(%d) error", spindle_root_count); + exit(1); + } + spindle_root_hop = ranks / spindle_root_count; + if (cobo_rank == 0) { + cobo_dbg_printf("root_count: %d root_hop: %d", spindle_root_count, spindle_root_hop); + } + return(rc); } +void ldcs_audit_server_md_barrier() +{ + cobo_barrier(); + return; +} + + +/* int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) { */ +/* int rc=0, i; */ +/* int parent_fd, child_fd; */ +/* int num_childs; */ + +/* /\* Registering parents *\/ */ +/* if(cobo_get_parent_socket(&parent_fd)!=COBO_SUCCESS) { */ +/* err_printf("Error, could not get parent socket\n"); */ +/* 
assert(0); */ +/* } */ + +/* debug_printf3("Registering fd %d for cobo parent connection\n",parent_fd); */ +/* ldcs_listen_register_fd(parent_fd, 0, &ldcs_audit_server_md_cobo_CB, (void *) ldcs_process_data); */ +/* ldcs_process_data->md_listen_to_parent=1; */ + +/* /\* Registering childs *\/ */ +/* cobo_get_num_childs(&num_childs); */ +/* for (i = 0; imd_rank == 0) { + if(cobo_get_forest_parent_socket(COBO_PRIMARY_TREE, &parent_fd)!=COBO_SUCCESS) { + err_printf("Error, could not get parent socket\n"); + assert(0); + } + debug_printf3("Registering fd %d for cobo parent connection\n",parent_fd); + ldcs_listen_register_fd(parent_fd, 0, &ldcs_audit_server_md_cobo_CB, (void *) ldcs_process_data); + } ldcs_process_data->md_listen_to_parent=1; - cobo_get_num_childs(&num_childs); + /* Registering childs */ + cobo_get_num_forest_childs(COBO_FOREST, &num_childs); for (i = 0; imd_listen_to_parent) { */ +/* if(cobo_get_parent_socket(&parent_fd)!=COBO_SUCCESS) { */ +/* _error("cobo internal error (parent socket)"); */ +/* } */ +/* ldcs_process_data->md_listen_to_parent=0; */ +/* ldcs_listen_unregister_fd(parent_fd); */ + +/* cobo_get_num_childs(&num_childs); */ +/* for (i = 0; imd_listen_to_parent) { - if(cobo_get_parent_socket(&parent_fd)!=COBO_SUCCESS) { - _error("cobo internal error (parent socket)"); - } - ldcs_process_data->md_listen_to_parent=0; - ldcs_listen_unregister_fd(parent_fd); - - cobo_get_num_childs(&num_childs); - for (i = 0; imd_listen_to_parent) return rc; + + /* Unregistering parents */ + cobo_get_num_forest_parents(COBO_FOREST, &num_parents); + for (i = 0; i < num_parents; i++) { + if(cobo_get_forest_parent_socket_at(i, &parent_fd)!=COBO_SUCCESS) { + err_printf("Error, could not get parent socket\n"); + assert(0); + } + debug_printf3("Registering fd %d for cobo parent connection\n",parent_fd); + ldcs_listen_unregister_fd(parent_fd); + } + /* Unregistering spindle_fe_main parent */ + if (ldcs_process_data->md_rank == 0) { +
if(cobo_get_forest_parent_socket(COBO_PRIMARY_TREE, &parent_fd)!=COBO_SUCCESS) { + err_printf("Error, could not get spindle_fe parent socket\n"); + assert(0); + } + debug_printf3("Registering fd %d for cobo spindle_fe parent connection\n",parent_fd); + ldcs_listen_unregister_fd(parent_fd); + } + ldcs_process_data->md_listen_to_parent=0; + + /* Registering childs */ + cobo_get_num_forest_childs(COBO_FOREST, &num_childs); + for (i = 0; imd_rank == 0) { - debug_printf3("Decided I am responsible for file %s\n", filename); - return 1; - } else { - debug_printf3("Decided I am not responsible for file %s\n", filename); - return 0; - } + // if(ldcs_process_data->md_rank % spindle_root_hop == 0) { + + // cobo_dbg_printf("is_responsible"); + int responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(filename); + +#ifdef LDCS_DBG + cobo_dbg_printf("heward: %s, rank: %d (tree_id: %d)", filename, ldcs_process_data->md_rank, responsible_tree_id); +#endif + + + // cobo_dbg_printf("filename: <%s>, path: <%s> => %d", filename, ldcs_process_data->md_path, responsible_tree_id); + // cobo_dbg_printf("Decided I am responsible for file <%s>: res_tree_id: %d", filename, responsible_tree_id); + /* if (responsible_tree_id != 1) { */ + /* cobo_dbg_printf("filename: <%s>, path: <%s> => %d", filename, ldcs_process_data->md_path, responsible_tree_id); */ + /* exit(1); */ + /* } */ + + // if(ldcs_process_data->md_rank == 1) { + if(ldcs_process_data->md_rank == responsible_tree_id) { + cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); + debug_printf3("Decided I am responsible for file %s\n", filename); + return 1; + } else { + // if (ldcs_process_data->md_path != NULL) { + // cobo_dbg_printf("I am not responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); + debug_printf3("Decided I am not responsible for file %s\n", filename); + return 0; + } } int ldcs_audit_server_md_forward_query(ldcs_process_data_t *ldcs_process_data, 
ldcs_message_t* msg) { int parent_fd; int result; - if (ldcs_process_data->md_rank == 0) { + int responsible_tree_id; + + // cobo_dbg_printf("%s", __func__); + responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); +#ifdef LDCS_DBG + cobo_dbg_printf("upward: %s (tree_id: %d)", ldcs_process_data->md_path, responsible_tree_id); +#endif + + if (ldcs_process_data->md_rank == responsible_tree_id) { /* We're root--no one to forward a query to*/ return 0; } - cobo_get_parent_socket(&parent_fd); + cobo_get_forest_parent_socket(responsible_tree_id, &parent_fd); result = write_msg(parent_fd, msg); if (result < 0) { err_printf("Problem writing message to parent, result is %d\n", result); @@ -237,7 +458,7 @@ int ldcs_audit_server_md_recv_from_parent(ldcs_message_t *msg) int fd; node_peer_t peer; - cobo_get_parent_socket(&fd); + cobo_get_forest_parent_socket(COBO_PRIMARY_TREE, &fd); return read_msg(fd, &peer, msg); } @@ -312,13 +533,23 @@ int ldcs_audit_server_md_broadcast(ldcs_process_data_t *ldcs_process_data, ldcs_ int fd, i; int result, global_result = 0; int num_childs = 0; + int responsible_tree_id = 0; + + // cobo_dbg_printf("%s", __func__); + responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); +#ifdef LDCS_DBG + cobo_dbg_printf("dwward: %s, type: %d (tree_id: %d) bcast", ldcs_process_data->md_path, responsible_tree_id, + msg->header.type); +#endif + - cobo_get_num_childs(&num_childs); + + cobo_get_num_forest_childs(responsible_tree_id, &num_childs); for (i = 0; imd_path); +#endif + return ldcs_audit_server_md_broadcast(ldcs_process_data, msg); + } - cobo_get_num_childs(&num_childs); + // cobo_dbg_printf("%s", __func__); +#ifdef LDCS_DBG + cobo_dbg_printf("dwward: %s, type: %d (tree_id: %d) bcas noncontigt", ldcs_process_data->md_path, responsible_tree_id, + msg->header.type); +#endif + responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); + 
cobo_get_num_forest_childs(responsible_tree_id, &num_childs); for (i = 0; inumber; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.md_port = args->port; + ldcs_process_data.md_path = NULL; ldcs_process_data.opts = args->opts; ldcs_process_data.pending_requests = new_requestor_list(); ldcs_process_data.completed_requests = new_requestor_list(); @@ -218,6 +225,16 @@ int ldcs_audit_server_run() /* Statistic functions */ int _ldcs_server_stat_print ( ldcs_server_stat_t *server_stat ) { int rc=0; + /* cobo_dbg_printf("SERVER[%02d] STAT: #conn=%2d md_size=%2d md_fan_out=%2d listen_time=%8.4f select_time=%8.4f ts_first_connect=%16.6f hostname=%s", */ + /* server_stat->md_rank, */ + /* server_stat->num_connections, */ + /* server_stat->md_size, */ + /* server_stat->md_fan_out, */ + /* server_stat->listen_time, */ + /* server_stat->select_time, */ + /* server_stat->starttime, */ + /* server_stat->hostname ); */ + debug_printf("SERVER[%02d] STAT: #conn=%2d md_size=%2d md_fan_out=%2d listen_time=%8.4f select_time=%8.4f ts_first_connect=%16.6f hostname=%s\n", server_stat->md_rank, server_stat->num_connections, @@ -230,54 +247,119 @@ int ldcs_audit_server_run() #define MYFORMAT "SERVER[%02d] STAT: %-10s, #cnt=%5d, bytes=%8.2f MB, time=%8.4f sec\n" + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"libread", */ + /* server_stat->libread.cnt, */ + /* server_stat->libread.bytes/1024.0/1024.0, */ + /* server_stat->libread.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"libstore", */ + /* server_stat->libstore.cnt, */ + /* server_stat->libstore.bytes/1024.0/1024.0, */ + /* server_stat->libstore.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"libdist", */ + /* server_stat->libdist.cnt, */ + /* server_stat->libdist.bytes/1024.0/1024.0, */ + /* server_stat->libdist.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"procdir", */ + /* server_stat->procdir.cnt, */ + /* 
server_stat->procdir.bytes/1024.0/1024.0, */ + /* server_stat->procdir.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"distdir", */ + /* server_stat->distdir.cnt, */ + /* server_stat->distdir.bytes/1024.0/1024.0, */ + /* server_stat->distdir.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"client_cb", */ + /* server_stat->client_cb.cnt, */ + /* server_stat->client_cb.bytes/1024.0/1024.0, */ + /* server_stat->client_cb.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"server_cb", */ + /* server_stat->server_cb.cnt, */ + /* server_stat->server_cb.bytes/1024.0/1024.0, */ + /* server_stat->server_cb.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"md_cb", */ + /* server_stat->md_cb.cnt, */ + /* server_stat->md_cb.bytes/1024.0/1024.0, */ + /* server_stat->md_cb.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"cl_msg_avg", */ + /* server_stat->clientmsg.cnt/((server_stat->num_connections>0)?server_stat->num_connections:1), */ + /* server_stat->clientmsg.bytes/1024.0/1024.0, */ + /* server_stat->clientmsg.time/((server_stat->num_connections>0)?server_stat->num_connections:1) ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"bcast", */ + /* server_stat->bcast.cnt, */ + /* server_stat->bcast.bytes/1024.0/1024.0, */ + /* server_stat->bcast.time ); */ + /* cobo_dbg_printf(MYFORMAT, */ + /* server_stat->md_rank,"preload_cb", */ + /* server_stat->preload.cnt, */ + /* server_stat->preload.bytes/1024.0/1024.0, */ + /* server_stat->preload.time ); */ + + debug_printf(MYFORMAT, server_stat->md_rank,"libread", server_stat->libread.cnt, server_stat->libread.bytes/1024.0/1024.0, server_stat->libread.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"libstore", server_stat->libstore.cnt, server_stat->libstore.bytes/1024.0/1024.0, server_stat->libstore.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"libdist", server_stat->libdist.cnt, 
server_stat->libdist.bytes/1024.0/1024.0, server_stat->libdist.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"procdir", server_stat->procdir.cnt, server_stat->procdir.bytes/1024.0/1024.0, server_stat->procdir.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"distdir", server_stat->distdir.cnt, server_stat->distdir.bytes/1024.0/1024.0, server_stat->distdir.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"client_cb", server_stat->client_cb.cnt, server_stat->client_cb.bytes/1024.0/1024.0, server_stat->client_cb.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"server_cb", server_stat->server_cb.cnt, server_stat->server_cb.bytes/1024.0/1024.0, server_stat->server_cb.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"md_cb", server_stat->md_cb.cnt, server_stat->md_cb.bytes/1024.0/1024.0, server_stat->md_cb.time ); + debug_printf(MYFORMAT, server_stat->md_rank,"cl_msg_avg", server_stat->clientmsg.cnt/((server_stat->num_connections>0)?server_stat->num_connections:1), diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index fe06a924..7d954abf 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -115,7 +115,7 @@ struct ldcs_process_data_struct char *pythonprefix; int number; int preload_done; - opt_t opts; + unsigned int opts; requestor_list_t pending_requests; requestor_list_t completed_requests; requestor_list_t pending_metadata_requests; @@ -127,6 +127,7 @@ struct ldcs_process_data_struct int md_fan_out; /* number of childs */ int md_listen_to_parent; unsigned int md_port; + char *md_path; /* statistics */ ldcs_server_stat_t server_stat; From 3f3d43d0f8007ec492644589d3ddafe7755ee6fa Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 11:11:06 -0700 Subject: [PATCH 02/11] Added --roots argument --- config.h.in | 3 ++ configure | 17 ++++++++++ configure.common.ac | 6 ++++ src/client/config.h.in | 3 
++ src/client/configure | 17 ++++++++++ src/cobo/cobo.c | 34 ++++++++++++++----- src/cobo/ldcs_cobo.h | 4 +++ src/fe/config.h.in | 3 ++ src/fe/configure | 17 ++++++++++ src/fe/startup/parseargs.cc | 19 +++++++++++ src/fe/startup/spindle_fe.cc | 3 +- src/fe/startup/spindle_fe_main.cc | 1 - src/include/spindle_launch.h | 3 ++ src/server/auditserver/ldcs_audit_server_md.h | 3 ++ .../auditserver/ldcs_audit_server_md_cobo.c | 27 ++++++++------- .../auditserver/ldcs_audit_server_process.c | 8 +++++ .../auditserver/ldcs_audit_server_process.h | 2 ++ src/server/config.h.in | 3 ++ src/server/configure | 17 ++++++++++ src/server/startup/spindle_be.cc | 7 ++++ 20 files changed, 173 insertions(+), 24 deletions(-) diff --git a/config.h.in b/config.h.in index 4360726e..58f096d7 100644 --- a/config.h.in +++ b/config.h.in @@ -100,6 +100,9 @@ /* The default port for Spindle */ #undef SPINDLE_PORT +/* The default number of roots in a cobo network */ +#undef SPINDLE_ROOTS + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS diff --git a/configure b/configure index 12cf007f..720fc745 100755 --- a/configure +++ b/configure @@ -819,6 +819,7 @@ enable_fast_install with_gnu_ld with_sysroot enable_libtool_lock +with_default_roots with_default_port with_default_num_ports with_localstorage @@ -1526,6 +1527,8 @@ Optional Packages: --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-sysroot=DIR Search for dependent libraries within DIR (or the compiler's sysroot if not specified). 
+ --with-default-roots=NUM + The number of roots in a cobo network --with-default-port=NUM TCP/IP Port for Spindle server communication --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server @@ -16192,11 +16195,20 @@ fi #Include common ops #Configure operations that are common between the Spindle and Spindle-client configurations #Network port and local storage location +DEFAULT_ROOTS=1 DEFAULT_PORT=21940 DEFAULT_LOC='$TMPDIR' DEFAULT_NUM_COBO_PORTS=25 +# Check whether --with-default-roots was given. +if test "${with_default_roots+set}" = set; then : + withval=$with_default_roots; SPINDLE_ROOTS=${withval} +else + SPINDLE_ROOTS=$DEFAULT_ROOTS +fi + + # Check whether --with-default-port was given. if test "${with_default_port+set}" = set; then : withval=$with_default_port; SPINDLE_PORT=${withval} @@ -16221,6 +16233,11 @@ else fi +cat >>confdefs.h <<_ACEOF +#define SPINDLE_ROOTS $SPINDLE_ROOTS +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_PORT $SPINDLE_PORT _ACEOF diff --git a/configure.common.ac b/configure.common.ac index 501a2de7..983ce67a 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -1,9 +1,14 @@ #Configure operations that are common between the Spindle and Spindle-client configurations #Network port and local storage location +DEFAULT_ROOTS=1 DEFAULT_PORT=21940 DEFAULT_LOC='$TMPDIR' DEFAULT_NUM_COBO_PORTS=25 +AC_ARG_WITH(default-roots, + [AS_HELP_STRING([--with-default-roots=NUM],[The number of roots in a cobo network])], + [SPINDLE_ROOTS=${withval}], + [SPINDLE_ROOTS=$DEFAULT_ROOTS]) AC_ARG_WITH(default-port, [AS_HELP_STRING([--with-default-port=NUM],[TCP/IP Port for Spindle server communication])], [SPINDLE_PORT=${withval}], @@ -16,6 +21,7 @@ AC_ARG_WITH(localstorage, [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], [SPINDLE_LOC=${withval}], [SPINDLE_LOC=$DEFAULT_LOC]) +AC_DEFINE_UNQUOTED([SPINDLE_ROOTS],[$SPINDLE_ROOTS],[The default number of roots in a cobo 
network]) AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle]) AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) diff --git a/src/client/config.h.in b/src/client/config.h.in index 1b497e6b..de8674b8 100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -94,6 +94,9 @@ /* The default port for Spindle */ #undef SPINDLE_PORT +/* The default number of roots in a cobo network */ +#undef SPINDLE_ROOTS + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS diff --git a/src/client/configure b/src/client/configure index e648e9ad..12609032 100755 --- a/src/client/configure +++ b/src/client/configure @@ -784,6 +784,7 @@ enable_fast_install with_gnu_ld with_sysroot enable_libtool_lock +with_default_roots with_default_port with_default_num_ports with_localstorage @@ -1469,6 +1470,8 @@ Optional Packages: --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-sysroot=DIR Search for dependent libraries within DIR (or the compiler's sysroot if not specified). + --with-default-roots=NUM + The number of roots in a cobo network --with-default-port=NUM TCP/IP Port for Spindle server communication --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server @@ -12401,11 +12404,20 @@ CC="$lt_save_CC" #Configure operations that are common between the Spindle and Spindle-client configurations #Network port and local storage location +DEFAULT_ROOTS=1 DEFAULT_PORT=21940 DEFAULT_LOC='$TMPDIR' DEFAULT_NUM_COBO_PORTS=25 +# Check whether --with-default-roots was given. +if test "${with_default_roots+set}" = set; then : + withval=$with_default_roots; SPINDLE_ROOTS=${withval} +else + SPINDLE_ROOTS=$DEFAULT_ROOTS +fi + + # Check whether --with-default-port was given. 
if test "${with_default_port+set}" = set; then : withval=$with_default_port; SPINDLE_PORT=${withval} @@ -12430,6 +12442,11 @@ else fi +cat >>confdefs.h <<_ACEOF +#define SPINDLE_ROOTS $SPINDLE_ROOTS +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_PORT $SPINDLE_PORT _ACEOF diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index a2ef697f..12832926 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -115,6 +115,7 @@ static int* cobo_child_incl = NULL; /* number of children each child is respons static int cobo_num_child_incl = 0; /* total number of children this node is responsible for */ /* forest data structures */ +static int cobo_is_forest_opened = 0; static int cobo_num_forest_childs = 0; /* number of clockwise direction peers (children) */ static int* cobo_forest_childs = NULL; /* ranks of clockwise direction peers**/ static int* cobo_forest_childs_fd = NULL; /* sockets to clockwise direction peers */ @@ -1227,7 +1228,7 @@ static int cobo_close_tree() } /* open socket forest */ -static int cobo_open_forest() +int cobo_open_forest() { int i; @@ -1314,7 +1315,7 @@ static int cobo_open_forest() // sleep(1000); // cobo_dbg_printf("============= %d", cobo_me); // exit(0); - + cobo_is_forest_opened = 1; return COBO_SUCCESS; } @@ -1587,7 +1588,12 @@ int cobo_get_forest_parent_socket_at(int num, int *fd) #ifdef COBO_FOREST int cobo_get_child_socket(int num, int *fd) { - cobo_get_forest_child_socket(0, num, fd); + if (cobo_is_forest_opened) { + cobo_get_forest_child_socket(0, num, fd); + } else { + assert(num < cobo_num_child); + *fd = cobo_child_fd[num]; + } return COBO_SUCCESS; } #else @@ -1601,7 +1607,11 @@ int cobo_get_child_socket(int num, int *fd) #ifdef COBO_FOREST int cobo_get_num_childs(int* num_childs) { - cobo_get_num_forest_childs(0, num_childs); + if (cobo_is_forest_opened) { + cobo_get_num_forest_childs(0, num_childs); + } else { + *num_childs=cobo_num_child; + } return COBO_SUCCESS; } #else @@ -1620,7 +1630,16 @@ int cobo_get_num_childs(int* 
num_childs) { #ifdef COBO_FOREST int cobo_get_parent_socket(int* fd) { - cobo_get_forest_parent_socket(0, fd); + if (cobo_is_forest_opened){ + cobo_get_forest_parent_socket(0, fd); + } else { + if (cobo_parent_fd != -1) { + *fd = cobo_parent_fd; + return COBO_SUCCESS; + } + return -1; /* failure RCs? */ + } + return COBO_SUCCESS; } #else int cobo_get_parent_socket(int* fd) @@ -1926,10 +1945,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_gettimeofday(&end); debug_printf3("Exiting cobo_init(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); - - /* open the forest */ - cobo_open_forest(); - + return COBO_SUCCESS; } diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 35678046..6b9ffaf3 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -48,6 +48,7 @@ extern "C" { #define COMBINE2(a, b) a ## _ ## b #define COMBINE(a, b) COMBINE2(a, b) #define cobo_open COMBINE(COBO_NAMESPACE, cobo_open) +#define cobo_open_forest COMBINE(COBO_NAMESPACE, cobo_open_forest) #define cobo_close COMBINE(COBO_NAMESPACE, cobo_close) #define cobo_get_parent_socket COMBINE(COBO_NAMESPACE, cobo_get_parent_socket) #define cobo_barrier COMBINE(COBO_NAMESPACE, cobo_barrier) @@ -78,6 +79,9 @@ extern "C" { /* provide list of ports and number of ports as input, get number of tasks and my rank as output */ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int *num_ranks); +/* Convert from tree to forest */ +int cobo_open_forest(); + /* shut down the connections between tasks and free data structures */ int cobo_close(); diff --git a/src/fe/config.h.in b/src/fe/config.h.in index 60f65abf..9bfe536f 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -130,6 +130,9 @@ /* The default port for Spindle */ #undef SPINDLE_PORT +/* The default number of roots in a cobo network */ +#undef SPINDLE_ROOTS + /* Define to 1 if you have the ANSI C header files.
*/ #undef STDC_HEADERS diff --git a/src/fe/configure b/src/fe/configure index 05bd6846..948a1857 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -805,6 +805,7 @@ enable_fast_install with_gnu_ld with_sysroot enable_libtool_lock +with_default_roots with_default_port with_default_num_ports with_localstorage @@ -1504,6 +1505,8 @@ Optional Packages: --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-sysroot=DIR Search for dependent libraries within DIR (or the compiler's sysroot if not specified). + --with-default-roots=NUM + The number of roots in a cobo network --with-default-port=NUM TCP/IP Port for Spindle server communication --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server @@ -16247,11 +16250,20 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu #Configure operations that are common between the Spindle and Spindle-client configurations #Network port and local storage location +DEFAULT_ROOTS=1 DEFAULT_PORT=21940 DEFAULT_LOC='$TMPDIR' DEFAULT_NUM_COBO_PORTS=25 +# Check whether --with-default-roots was given. +if test "${with_default_roots+set}" = set; then : + withval=$with_default_roots; SPINDLE_ROOTS=${withval} +else + SPINDLE_ROOTS=$DEFAULT_ROOTS +fi + + # Check whether --with-default-port was given. 
if test "${with_default_port+set}" = set; then : withval=$with_default_port; SPINDLE_PORT=${withval} @@ -16276,6 +16288,11 @@ else fi +cat >>confdefs.h <<_ACEOF +#define SPINDLE_ROOTS $SPINDLE_ROOTS +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_PORT $SPINDLE_PORT _ACEOF diff --git a/src/fe/startup/parseargs.cc b/src/fe/startup/parseargs.cc index aabe27ce..55c4b425 100644 --- a/src/fe/startup/parseargs.cc +++ b/src/fe/startup/parseargs.cc @@ -45,6 +45,7 @@ using namespace std; #define AUDITTYPE 'k' #define RELOCSO 'l' #define NOCLEAN 'n' +#define COBO_FOREST 'm' #define LOCATION 'o' #define PUSH 'p' #define PULL 'q' @@ -163,6 +164,7 @@ static const int DISABLE_LOGGING_FLAGS = OPTION_HIDDEN; #endif static bool logging_enabled = DEFAULT_LOGGING_ENABLED; +static unsigned int spindle_roots = SPINDLE_ROOTS; static unsigned int spindle_port = SPINDLE_PORT; static unsigned int num_ports = NUM_COBO_PORTS; @@ -193,6 +195,8 @@ struct argp_option options[] = { "These options configure Spindle's network model. Typical Spindle runs should not need to set these.", GROUP_NETWORK }, { "cobo", COBO, NULL, 0, "Use a tree-based cobo network for distributing objects", GROUP_NETWORK }, + { "roots", COBO_FOREST, "X", 0, + "The number of roots in a cobo network. Default: " STR(SPINDLE_ROOTS), GROUP_NETWORK }, { "port", PORT, "port1-port2", 0, "TCP/IP port range for Spindle servers. 
Default: " STR(SPINDLE_PORT) "-" STR(SPINDLE_MAX_PORT), GROUP_NETWORK }, { NULL, 0, NULL, 0, @@ -321,6 +325,15 @@ static int parse(int key, char *arg, struct argp_state *vstate) preload_file = arg; return 0; } + else if (entry->key == COBO_FOREST) { + int v = atoi(arg); + if (v <= 0) { + argp_error(state, "Roots was given a negative or 0 value"); + } else { + spindle_roots = v; + } + return 0; + } else if (entry->key == PORT) { spindle_port = atoi(arg); if (!spindle_port) { @@ -503,6 +516,11 @@ char *getPreloadFile() return preload_file; } +unsigned int getRoots() +{ + return spindle_roots; +} + unsigned int getPort() { return spindle_port; @@ -636,6 +654,7 @@ void parseCommandLine(int argc, char *argv[], spindle_args_t *args) opt_t opts = parseArgs(argc, argv); args->number = getpid(); + args->num_roots = getRoots(); args->port = getPort(); args->num_ports = getNumPorts(); args->opts = opts; diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index 863e6d1f..bbc0b2a0 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -60,7 +60,7 @@ void pack_param(char *value, char *buffer, unsigned int &pos) static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) { - buffer_size = sizeof(unsigned int) * 6; + buffer_size = sizeof(unsigned int) * 7; buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); buffer_size += args->location ? 
strlen(args->location) + 1 : 1; @@ -70,6 +70,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) unsigned int pos = 0; char *buf = (char *) malloc(buffer_size); pack_param(args->number, buf, pos); + pack_param(args->num_roots, buf, pos); pack_param(args->port, buf, pos); pack_param(args->num_ports, buf, pos); pack_param(args->opts, buf, pos); diff --git a/src/fe/startup/spindle_fe_main.cc b/src/fe/startup/spindle_fe_main.cc index 3bfa31a6..37d67acd 100644 --- a/src/fe/startup/spindle_fe_main.cc +++ b/src/fe/startup/spindle_fe_main.cc @@ -77,7 +77,6 @@ int main(int argc, char *argv[]) bare_printf2("%s ", app_argv[i]); } bare_printf2("\n"); - if (params.use_launcher == serial_launcher) { debug_printf("Starting application in serial mode\n"); result = startSerialFE(app_argc, app_argv, daemon_argc, daemon_argv, ¶ms); diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 6244e5df..999f7e98 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -80,6 +80,9 @@ typedef struct { /* A unique number that will be used to identify this spindle session */ unsigned int number; + /* The number of roots in the cobo network */ + unsigned int num_roots; + /* The beginning port in a range that will be used for server->server communication */ unsigned int port; diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index 45ff67d5..5024f75a 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -59,6 +59,9 @@ typedef void* node_peer_t; /* Any initialization can be done here. */ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, unique_id_t unique_id, ldcs_process_data_t *data); +/* Any further initialization (after receiving spindle command arguments) can be done here. 
*/ +int ldcs_audit_server_md_init_post_process(); + /* register_fd should, for every fd we want Spindle to recv messages on, call ldcs_listen_register_fd with the fd and a callback function to be triggered when a message arrives. */ diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 8f66806d..441583b0 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -199,16 +199,8 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, ldcs_cobo_write_fd(root_fd, &ack, sizeof(ack)); debug_printf3("sent FE client signal that server are ready %d\n",ack); } - if (NULL == (env = getenv("SPINDLE_ROOT_COUNT"))) { - spindle_root_count = 1; - } else { - if (atoi(env) == 0) { - spindle_root_count = 1; - } else { - spindle_root_count = atoi(env); - } - } - + // + spindle_root_count = 4;//data->md_roots; if (spindle_root_count > ranks || spindle_root_count <= 0) { cobo_dbg_printf("spindle_root_count(%d) error", spindle_root_count); err_printf("spindle_root_count(%d) error", spindle_root_count); @@ -218,10 +210,19 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, if (cobo_rank == 0) { cobo_dbg_printf("root_count: %d root_hop: %d", spindle_root_count, spindle_root_hop); } + // return(rc); } +int ldcs_audit_server_md_init_post_process(ldcs_process_data_t *ldcs_process_data) +{ + if (ldcs_process_data->md_roots > 1) { + return cobo_open_forest(); + } + return 1; +} + void ldcs_audit_server_md_barrier() { cobo_barrier(); @@ -386,13 +387,13 @@ int ldcs_audit_server_md_is_responsible ( ldcs_process_data_t *ldcs_process_data // if(ldcs_process_data->md_rank == 1) { if(ldcs_process_data->md_rank == responsible_tree_id) { - cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); - debug_printf3("Decided I am responsible for file %s\n", filename); + cobo_dbg_printf("I am 
responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); + debug_printf3("Decided I am responsible for file %s\n", filename); return 1; } else { // if (ldcs_process_data->md_path != NULL) { // cobo_dbg_printf("I am not responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); - debug_printf3("Decided I am not responsible for file %s\n", filename); + // debug_printf3("Decided I am not responsible for file %s\n", filename); return 0; } } diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 6d2ef135..8b231ec5 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -106,6 +106,7 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_process_data.location = args->location; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; + ldcs_process_data.md_roots = args->num_roots; ldcs_process_data.md_port = args->port; ldcs_process_data.md_path = NULL; ldcs_process_data.opts = args->opts; @@ -162,6 +163,13 @@ int ldcs_audit_server_process(spindle_args_t *args) return 0; } +int ldcs_audit_server_network_post_setup() +{ + int result; + result = ldcs_audit_server_md_init_post_process(ldcs_process_data); + return result; +} + int ldcs_audit_server_run() { /* start loop */ diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 7d954abf..b78912b6 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -126,6 +126,7 @@ struct ldcs_process_data_struct int md_size; int md_fan_out; /* number of childs */ int md_listen_to_parent; + int md_roots; unsigned int md_port; char *md_path; @@ -137,6 +138,7 @@ typedef struct ldcs_process_data_struct ldcs_process_data_t; int ldcs_audit_server_network_setup(unsigned int port, unsigned int num_ports, 
unique_id_t unique_id, void **packed_setup_data, int *data_size); int ldcs_audit_server_process (spindle_args_t *args); +int ldcs_audit_server_network_post_setup(); int ldcs_audit_server_run(); #define CLIENT_CB_AUX_FD INT32_MAX diff --git a/src/server/config.h.in b/src/server/config.h.in index 879cef0d..d894a637 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -121,6 +121,9 @@ /* The default port for Spindle */ #undef SPINDLE_PORT +/* The default number of roots in a cobo network */ +#undef SPINDLE_ROOTS + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS diff --git a/src/server/configure b/src/server/configure index 49fa78d0..9f46b894 100755 --- a/src/server/configure +++ b/src/server/configure @@ -810,6 +810,7 @@ enable_fast_install with_gnu_ld with_sysroot enable_libtool_lock +with_default_roots with_default_port with_default_num_ports with_localstorage @@ -1503,6 +1504,8 @@ Optional Packages: --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-sysroot=DIR Search for dependent libraries within DIR (or the compiler's sysroot if not specified). + --with-default-roots=NUM + The number of roots in a cobo network --with-default-port=NUM TCP/IP Port for Spindle server communication --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server @@ -16246,11 +16249,20 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu #Configure operations that are common between the Spindle and Spindle-client configurations #Network port and local storage location +DEFAULT_ROOTS=1 DEFAULT_PORT=21940 DEFAULT_LOC='$TMPDIR' DEFAULT_NUM_COBO_PORTS=25 +# Check whether --with-default-roots was given. +if test "${with_default_roots+set}" = set; then : + withval=$with_default_roots; SPINDLE_ROOTS=${withval} +else + SPINDLE_ROOTS=$DEFAULT_ROOTS +fi + + # Check whether --with-default-port was given. 
if test "${with_default_port+set}" = set; then : withval=$with_default_port; SPINDLE_PORT=${withval} @@ -16275,6 +16287,11 @@ else fi +cat >>confdefs.h <<_ACEOF +#define SPINDLE_ROOTS $SPINDLE_ROOTS +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_PORT $SPINDLE_PORT _ACEOF diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 5e2ad2df..88b2541b 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -50,6 +50,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) int pos = 0; char *buf = static_cast(buffer); unpack_param(args->number, buf, pos); + unpack_param(args->num_roots, buf, pos); unpack_param(args->port, buf, pos); unpack_param(args->num_ports, buf, pos); unpack_param(args->opts, buf, pos); @@ -155,6 +156,12 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i } } + result = ldcs_audit_server_network_post_setup(); + if (result == -1) { + err_printf("Error in ldcs_audit_server_network_post_setup"); + return -1; + } + debug_printf("Setup done. 
Running server.\n"); ldcs_audit_server_run(); if (result == -1) { From 6741dcf64d4f14a6b3eac9fe91372ea1ab2119d2 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 15:48:29 -0700 Subject: [PATCH 03/11] multi-roots --- src/cobo/cobo.c | 111 +++++++++--------- src/fe/startup/parseargs.cc | 2 +- src/logging/spindle_debug.h | 3 +- src/server/auditserver/ldcs_audit_server_md.h | 2 +- .../auditserver/ldcs_audit_server_md_cobo.c | 79 ++++--------- .../auditserver/ldcs_audit_server_process.c | 10 +- .../auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 13 +- 8 files changed, 96 insertions(+), 126 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 12832926..e4e9f88d 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -55,8 +55,6 @@ Place, Suite 330, Boston, MA 02111-1307 USA #define COBO_CONNECT_TIMELIMIT (600) /* seconds -- wait this long before giving up for good */ #endif -#define ENABLE_HANDSHAKE - #if defined(_IA64_) #undef htons #undef ntohs @@ -64,9 +62,6 @@ Place, Suite 330, Boston, MA 02111-1307 USA #define ntohs(__bsx) ((((__bsx) >> 8) & 0xff) | (((__bsx) & 0xff) << 8)) #endif -#define err_printf cobo_dbg_printf - - /* * ========================================================================== * ========================================================================== @@ -482,8 +477,6 @@ static int cobo_connect_hostname(char* hostname, int rank) saddr = *((struct in_addr *) (*he->h_addr_list)); } - - /* Loop until we make a connection or until our timeout expires. 
*/ struct timeval start, end; cobo_gettimeofday(&start); @@ -505,7 +498,6 @@ static int cobo_connect_hostname(char* hostname, int rank) s = cobo_connect(saddr, htons(port), connect_timeout); if (s != -1) { /* got a connection, let's test it out */ - // cobo_dbg_printf("Connected to rank %d port %d on %s", rank, port, hostname); debug_printf3("Connected to rank %d port %d on %s\n", rank, port, hostname); int test_failed = 0; @@ -538,7 +530,6 @@ static int cobo_connect_hostname(char* hostname, int rank) assert(0 && "Unknown return value from handshake_server\n"); } - /* write cobo service id */ if (!test_failed && cobo_write_fd_w_suppress(s, &cobo_serviceid, sizeof(cobo_serviceid), 1) < 0) { debug_printf3("Writing service id to %s on port %d\n", @@ -988,7 +979,6 @@ static int cobo_open_tree() sockfd = cobo_create_socket(); cobo_bind_and_listen(sockfd); cobo_parent_fd = cobo_accept_and_handshake(sockfd); - // sleep(1); close(sockfd); #else /* create a socket to accept connection from parent IPPROTO_TCP */ @@ -1169,14 +1159,11 @@ static int cobo_open_tree() /* given our rank and the number of ranks, compute the ranks of our children */ cobo_compute_children(); /* cobo_compute_children_root_C1(); */ - /* for each child, open socket connection and forward hostname table */ for(i=0; i < cobo_num_child; i++) { /* get rank and hostname for this child */ int c = cobo_child[i]; char* child_hostname = cobo_expand_hostname(c); - - debug_printf3("%d: on COBO%02d: connect to child #%02d (%s)\n",i,cobo_me,c,child_hostname); /* connect to child */ cobo_child_fd[i] = cobo_connect_hostname(child_hostname, c); @@ -1197,9 +1184,7 @@ static int cobo_open_tree() /* free the child hostname string */ free(child_hostname); - } - return COBO_SUCCESS; } @@ -1230,18 +1215,7 @@ static int cobo_close_tree() /* open socket forest */ int cobo_open_forest() { - int i; - /* read our rank number */ - /* if (cobo_me == 0) { */ - /* int i; */ - /* cobo_dbg_printf("cobo_me: %d, cobo_nprocs: %d, 
cobo_hostlist_size: %d", cobo_me, cobo_nprocs, cobo_hostlist_size); */ - /* cobo_dbg_printf("cobo_hostlist:"); */ - /* for (i = 0; i < cobo_nprocs; i++) { */ - /* cobo_dbg_printf(" %s", cobo_expand_hostname(i)); */ - /* } */ - /* } */ - /* compute ranks of peers */ { int rank_offset = 1; @@ -1249,8 +1223,6 @@ int cobo_open_forest() cobo_num_forest_childs++; rank_offset = rank_offset << 1; } - - i = 0; rank_offset = 1; cobo_forest_childs = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "clockwise peer buffer"); @@ -1261,19 +1233,9 @@ int cobo_open_forest() rank_offset = rank_offset << 1; i++; } - - /* if (cobo_me == 12) { */ - /* for (i = 0; i < cobo_num_forest_childs; i++) { */ - /* cobo_dbg_printf("-> %d", cobo_forest_childs[i]); */ - /* } */ - /* cobo_dbg_printf("======"); */ - /* for (i = 0; i < cobo_num_forest_childs; i++) { */ - /* cobo_dbg_printf("-> %d", cobo_forest_parents[i]); */ - /* } */ - /* } */ - } - - /* Create Binominal Forest overlay network */ + } + + /* Create Binomial Forest overlay network */ int sockfd; int num_successive_listen_rank_num = 1; cobo_forest_childs_fd = (int*)cobo_malloc(sizeof(int) * cobo_num_forest_childs, "clockwise peer fd buffer"); @@ -1293,28 +1255,18 @@ int cobo_open_forest() if (connection_group_id % 2 == 0) { /* Active connection => Passive connection */ - // cobo_dbg_printf("%d: connect to %d (Hop: %d)", cobo_me, conn_rank, num_successive_listen_rank_num); cobo_forest_childs_fd[i] = cobo_connect_rank(conn_rank); - // cobo_dbg_printf("%d: listen (Hop: %d)", cobo_me, num_successive_listen_rank_num); cobo_listen(sockfd); - // cobo_dbg_printf("%d: accept (Hop: %d)", cobo_me, num_successive_listen_rank_num); cobo_forest_parents_fd[i] = cobo_accept_and_handshake(sockfd); } else { /* Passive connection -> Active connection */ - // cobo_dbg_printf("%d: listen (Hop: %d)", cobo_me, num_successive_listen_rank_num); cobo_listen(sockfd); - // cobo_dbg_printf("%d: accept (Hop: %d)", cobo_me, num_successive_listen_rank_num); 
cobo_forest_parents_fd[i] = cobo_accept_and_handshake(sockfd); - // cobo_dbg_printf("%d: connect to %d (Hop: %d)", cobo_me, conn_rank, num_successive_listen_rank_num); cobo_forest_childs_fd[i] = cobo_connect_rank(conn_rank); } num_successive_listen_rank_num = num_successive_listen_rank_num << 1; } close(sockfd); - // cobo_dbg_printf("============= %d", cobo_me); - // sleep(1000); - // cobo_dbg_printf("============= %d", cobo_me); - // exit(0); cobo_is_forest_opened = 1; return COBO_SUCCESS; } @@ -1506,6 +1458,16 @@ int cobo_get_num_tree(int *num_trees) int cobo_get_forest_child_socket(int root, int num, int *fd) { int num_forest_childs; + + if (!cobo_is_forest_opened) { + if (root == COBO_PRIMARY_TREE) { + return cobo_get_child_socket(num, fd); + } else { + err_printf("Trying to use forest before cobo_open_forest"); + exit(1); + } + } + if (root == COBO_FOREST) { *fd = cobo_forest_childs_fd[num]; return COBO_SUCCESS; @@ -1526,6 +1488,15 @@ int cobo_get_num_forest_childs(int root, int* num_forest_childs) int logical_rank = 0; /*logical_rank of the root is 0*/ int tmp_cobo_nprocs; + if (!cobo_is_forest_opened) { + if (root == COBO_PRIMARY_TREE) { + return cobo_get_num_childs(num_forest_childs); + } else { + err_printf("Trying to use forest before cobo_open_forest"); + exit(1); + } + } + if (root == COBO_FOREST) { *num_forest_childs = cobo_num_forest_childs; return COBO_SUCCESS; @@ -1551,6 +1522,16 @@ int cobo_get_forest_parent_socket(int root, int *fd) { int num_childs; int forest_parents_fd_index; + + if (!cobo_is_forest_opened) { + if (root == COBO_PRIMARY_TREE) { + return cobo_get_parent_socket(fd); + } else { + err_printf("Trying to use forest before cobo_open_forest"); + exit(1); + } + } + if (cobo_me == root) { if (cobo_me != 0) { cobo_dbg_printf("root (tree_id=%d) does not have parent", root); @@ -1569,6 +1550,16 @@ int cobo_get_forest_parent_socket(int root, int *fd) int cobo_get_num_forest_parents(int root, int *num_parents) { + if (!cobo_is_forest_opened) { 
+ if (root == COBO_PRIMARY_TREE) { + *num_parents = 1; + return COBO_SUCCESS; + } else { + err_printf("Trying to use forest before cobo_open_forest"); + exit(1); + } + } + if (root == COBO_FOREST) { *num_parents = cobo_num_forest_childs; } else { @@ -1579,6 +1570,16 @@ int cobo_get_num_forest_parents(int root, int *num_parents) int cobo_get_forest_parent_socket_at(int num, int *fd) { + if (!cobo_is_forest_opened) { + if (num == COBO_PRIMARY_TREE) { + *fd = cobo_parent_fd; + return COBO_SUCCESS; + } else { + err_printf("Trying to use forest before cobo_open_forest"); + exit(1); + } + } + *fd = cobo_forest_parents_fd[num]; return COBO_SUCCESS; } @@ -1792,11 +1793,7 @@ int cobo_alltoall(void* sendbuf, int sendcount, void* recvbuf) /* * Perform MPI-like Allreduce maximum of a single int from each task */ -static int cobo_allreduce() -{ - - -} +static int cobo_allreduce(){} /* * Perform MPI-like Allreduce maximum of a single int from each task @@ -1945,7 +1942,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_gettimeofday(&end); debug_printf3("Exiting cobo_init(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); - + return COBO_SUCCESS; } diff --git a/src/fe/startup/parseargs.cc b/src/fe/startup/parseargs.cc index 55c4b425..6b1753fc 100644 --- a/src/fe/startup/parseargs.cc +++ b/src/fe/startup/parseargs.cc @@ -328,7 +328,7 @@ static int parse(int key, char *arg, struct argp_state *vstate) else if (entry->key == COBO_FOREST) { int v = atoi(arg); if (v <= 0) { - argp_error(state, "Roots was given a negative or 0 value"); + argp_error(state, "'roots' was given a negative or 0 value or invalid value"); } else { spindle_roots = v; } diff --git a/src/logging/spindle_debug.h b/src/logging/spindle_debug.h index c8c796b9..1242ea8d 100644 --- a/src/logging/spindle_debug.h +++ b/src/logging/spindle_debug.h @@ -71,7 +71,7 @@ extern "C" { #endif -#define FOREST_DEBUG +//#define FOREST_DEBUG #if defined(FOREST_DEBUG) 
#define cobo_dbg_printf(format, ...) \ do { \ @@ -84,3 +84,4 @@ extern "C" { #endif #endif + diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index 5024f75a..1cb610eb 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -60,7 +60,7 @@ typedef void* node_peer_t; int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, unique_id_t unique_id, ldcs_process_data_t *data); /* Any further initialization (after receiving spindle command arguments) can be done here. */ -int ldcs_audit_server_md_init_post_process(); +int ldcs_audit_server_md_init_post_process(unsigned int md_roots); /* register_fd should, for every fd we want Spindle to recv messages on, call ldcs_listen_register_fd with the fd and a callback function to be triggered diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 441583b0..a9f3c5d3 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "ldcs_api.h" #include "ldcs_api_listen.h" @@ -41,7 +43,7 @@ extern unique_id_t unique_id; int cobo_rank = -1; int cobo_size = -1; -int spindle_root_count = -1; +int spindle_root_count = 1; int spindle_root_hop = -1; extern int ll_read(int fd, void *buf, size_t count); @@ -53,20 +55,14 @@ static void ldcs_audit_server_md_backtrace_print() char **strings; nptrs = backtrace(buffer, 100); - /* backtrace_symbols_fd(buffer, nptrs, STDOUT_FILENO)*/ strings = backtrace_symbols(buffer, nptrs); if (strings == NULL) { perror("backtrace_symbols"); exit(EXIT_FAILURE); } - - /* - You can translate the address to function name by - addr2line -f -e ./a.out
- */ for (j = 0; j < nptrs; j++) { - cobo_dbg_printf(" %s", strings[j]); + debug_printf(" %s", strings[j]); } free(strings); @@ -95,7 +91,6 @@ static int ldcs_audit_server_md_get_responsible_tree_id(char *path) /*TODO: Can I use message header for this if-else to find out if tree_id is for management communication or library broadcast ? */ if (path != NULL && strlen(path) > 0) { - //responsible_tree_id = spindle_root_count -1;//COBO_PRIMARY_TREE; path_dup = strdup(path); ret = stat(path_dup, &st); if ((st.st_mode & S_IFMT) == S_IFDIR) { @@ -106,12 +101,9 @@ static int ldcs_audit_server_md_get_responsible_tree_id(char *path) dir = dirname(path_dup); } responsible_tree_id = ldcs_audit_server_md_hashval(dir) % spindle_root_count; - // responsible_tree_id = ldcs_audit_server_md_hashval(path) % (spindle_root_count - 1); - // responsible_tree_id++; #ifdef LDCS_DBG cobo_dbg_printf("%s -> %d", path, responsible_tree_id); #endif - // ldcs_audit_server_md_backtrace_print(); free(path_dup); } return responsible_tree_id; @@ -163,7 +155,6 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, int i; char* env; - portlist = malloc(sizeof(unsigned int) * (num_ports + 1)); for (i = 0; i < num_ports; i++) { portlist[i] = port + i; @@ -181,6 +172,7 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, cobo_rank = data->server_stat.md_rank = data->md_rank = my_rank; cobo_size = data->server_stat.md_size = data->md_size = ranks; + spindle_root_hop = cobo_size; data->md_listen_to_parent = 0; // cobo_get_num_childs(&fanout); @@ -188,7 +180,6 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, data->server_stat.md_fan_out = data->md_fan_out = fanout; cobo_barrier(); - /* send ack about being ready */ if (data->md_rank == 0) { int root_fd, ack=13; @@ -199,26 +190,26 @@ int ldcs_audit_server_md_init(unsigned int port, unsigned int num_ports, ldcs_cobo_write_fd(root_fd, &ack, sizeof(ack)); debug_printf3("sent FE client 
signal that server are ready %d\n",ack); } - // - spindle_root_count = 4;//data->md_roots; - if (spindle_root_count > ranks || spindle_root_count <= 0) { - cobo_dbg_printf("spindle_root_count(%d) error", spindle_root_count); - err_printf("spindle_root_count(%d) error", spindle_root_count); - exit(1); - } - spindle_root_hop = ranks / spindle_root_count; - if (cobo_rank == 0) { - cobo_dbg_printf("root_count: %d root_hop: %d", spindle_root_count, spindle_root_hop); - } - // - + return(rc); } -int ldcs_audit_server_md_init_post_process(ldcs_process_data_t *ldcs_process_data) +int ldcs_audit_server_md_init_post_process(unsigned int md_roots) { - if (ldcs_process_data->md_roots > 1) { - return cobo_open_forest(); + if (md_roots > 1) { + spindle_root_count = md_roots; + if (spindle_root_count > cobo_size || spindle_root_count <= 0) { + err_printf("spindle_root_count(%d) error", spindle_root_count); + exit(1); + } + spindle_root_hop = cobo_size / spindle_root_count; + if (cobo_rank == 0) { + cobo_dbg_printf("root_count: %d root_hop: %d", spindle_root_count, spindle_root_hop); + } + cobo_open_forest(); + } else { + spindle_root_count = 1; + spindle_root_hop = cobo_size / spindle_root_count; } return 1; } @@ -256,7 +247,8 @@ void ldcs_audit_server_md_barrier() /* return(rc); */ /* } */ -int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) { +int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) +{ int rc=0, i; int parent_fd, child_fd; int num_parents, num_childs; @@ -366,36 +358,18 @@ int ldcs_audit_server_md_destroy ( ldcs_process_data_t *ldcs_process_data ) int ldcs_audit_server_md_is_responsible ( ldcs_process_data_t *ldcs_process_data, char *filename ) { - /*kento*/ - /* current implementation: only MD rank does file operations */ - // if(ldcs_process_data->md_rank % spindle_root_hop == 0) { - - // cobo_dbg_printf("is_responsible"); int responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(filename); 
#ifdef LDCS_DBG cobo_dbg_printf("heward: %s, rank: %d (tree_id: %d)", filename, ldcs_process_data->md_rank, responsible_tree_id); #endif - - // cobo_dbg_printf("filename: <%s>, path: <%s> => %d", filename, ldcs_process_data->md_path, responsible_tree_id); - // cobo_dbg_printf("Decided I am responsible for file <%s>: res_tree_id: %d", filename, responsible_tree_id); - /* if (responsible_tree_id != 1) { */ - /* cobo_dbg_printf("filename: <%s>, path: <%s> => %d", filename, ldcs_process_data->md_path, responsible_tree_id); */ - /* exit(1); */ - /* } */ - - // if(ldcs_process_data->md_rank == 1) { if(ldcs_process_data->md_rank == responsible_tree_id) { cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); debug_printf3("Decided I am responsible for file %s\n", filename); return 1; - } else { - // if (ldcs_process_data->md_path != NULL) { - // cobo_dbg_printf("I am not responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); - // debug_printf3("Decided I am not responsible for file %s\n", filename); - return 0; } + return 0; } int ldcs_audit_server_md_forward_query(ldcs_process_data_t *ldcs_process_data, ldcs_message_t* msg) { @@ -458,7 +432,6 @@ int ldcs_audit_server_md_recv_from_parent(ldcs_message_t *msg) { int fd; node_peer_t peer; - cobo_get_forest_parent_socket(COBO_PRIMARY_TREE, &fd); return read_msg(fd, &peer, msg); } @@ -536,7 +509,7 @@ int ldcs_audit_server_md_broadcast(ldcs_process_data_t *ldcs_process_data, ldcs_ int num_childs = 0; int responsible_tree_id = 0; - // cobo_dbg_printf("%s", __func__); + responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); #ifdef LDCS_DBG cobo_dbg_printf("dwward: %s, type: %d (tree_id: %d) bcast", ldcs_process_data->md_path, responsible_tree_id, @@ -544,7 +517,6 @@ int ldcs_audit_server_md_broadcast(ldcs_process_data_t *ldcs_process_data, ldcs_ #endif - cobo_get_num_forest_childs(responsible_tree_id, &num_childs); for (i = 0; 
iserver network */ ldcs_audit_server_md_init(port, num_ports, unique_id, &ldcs_process_data); + /* Use network to broadcast configuration parameters */ ldcs_message_t msg; msg.header.type = 0; msg.header.len = 0; msg.data = NULL; debug_printf2("Reading setup message from parent\n"); + result = ldcs_audit_server_md_recv_from_parent(&msg); if (result == -1) { err_printf("Error reading setup message from parent\n"); @@ -81,6 +83,7 @@ int ldcs_audit_server_network_setup(unsigned int port, unsigned int num_ports, u } assert(msg.header.type == LDCS_MSG_SETTINGS); ldcs_process_data.md_path = NULL; + result = ldcs_audit_server_md_broadcast(&ldcs_process_data, &msg); if (result == -1) { err_printf("Error broadcast setup message to children\n"); @@ -89,7 +92,6 @@ int ldcs_audit_server_network_setup(unsigned int port, unsigned int num_ports, u *packed_setup_data = msg.data; *data_size = msg.header.len; - /* Synchronize here because a library file may be tranfered via ldcs_audit_server_md_recv_from_parent if a fast process start sending files by using this tree. 
*/ @@ -152,21 +154,19 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_process_data.serverfd = fd; ldcs_audit_server_md_register_fd(&ldcs_process_data); - /* register server listen fd to listener */ if (fd != -1) ldcs_listen_register_fd(fd, serverid, &_ldcs_server_CB, (void *) &ldcs_process_data); - debug_printf3("Initializing cache\n"); ldcs_cache_init(); return 0; } -int ldcs_audit_server_network_post_setup() +int ldcs_audit_server_network_post_setup(spindle_args_t* args) { int result; - result = ldcs_audit_server_md_init_post_process(ldcs_process_data); + result = ldcs_audit_server_md_init_post_process(args->num_roots); return result; } diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index b78912b6..41675a1e 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -138,7 +138,7 @@ typedef struct ldcs_process_data_struct ldcs_process_data_t; int ldcs_audit_server_network_setup(unsigned int port, unsigned int num_ports, unique_id_t unique_id, void **packed_setup_data, int *data_size); int ldcs_audit_server_process (spindle_args_t *args); -int ldcs_audit_server_network_post_setup(); +int ldcs_audit_server_network_post_setup(spindle_args_t* args); int ldcs_audit_server_run(); #define CLIENT_CB_AUX_FD INT32_MAX diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 88b2541b..825dd5ff 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -131,7 +131,6 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i assert(args.unique_id == unique_id); assert(args.port == port); - /* Expand environment variables in location. 
*/ char *new_location = parse_location(args.location); if (!new_location) { @@ -142,6 +141,12 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i free(args.location); args.location = new_location; + result = ldcs_audit_server_network_post_setup(&args); + if (result == -1) { + err_printf("Error in ldcs_audit_server_network_post_setup"); + return -1; + } + result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); @@ -156,12 +161,6 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i } } - result = ldcs_audit_server_network_post_setup(); - if (result == -1) { - err_printf("Error in ldcs_audit_server_network_post_setup"); - return -1; - } - debug_printf("Setup done. Running server.\n"); ldcs_audit_server_run(); if (result == -1) { From 8259eb874fdd86261eb1f8ecf3819d05096b07e3 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 15:58:51 -0700 Subject: [PATCH 04/11] clean up --- src/cobo/ldcs_cobo.h | 1 - .../auditserver/ldcs_audit_server_filemngt.c | 13 +--- .../auditserver/ldcs_audit_server_handlers.c | 37 +-------- .../auditserver/ldcs_audit_server_process.c | 75 ------------------- 4 files changed, 5 insertions(+), 121 deletions(-) diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 6b9ffaf3..57e600a4 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -160,7 +160,6 @@ int cobo_get_num_forest_childs(int tree_id, int* num_childs); int cobo_get_forest_parent_socket(int tree_id, int *fd); int cobo_get_forest_parent_socket_at(int num, int *fd); int cobo_get_num_forest_parents(int tree_id, int *num_parents); - int cobo_get_num_childs(int* num_childs); /* Methods to access child fds */ int cobo_get_child_socket(int num, int *fd); diff --git a/src/server/auditserver/ldcs_audit_server_filemngt.c b/src/server/auditserver/ldcs_audit_server_filemngt.c index 51767411..d25a8064 100644 --- 
a/src/server/auditserver/ldcs_audit_server_filemngt.c +++ b/src/server/auditserver/ldcs_audit_server_filemngt.c @@ -34,6 +34,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ldcs_api_listen.h" #include "ldcs_audit_server_process.h" #include "ldcs_audit_server_filemngt.h" +#include "ldcs_elf_read.h" #include "config.h" #if !defined(LIBEXECDIR) @@ -57,8 +58,6 @@ int ldcs_audit_server_filemngt_init (char* location) { return(rc); } -extern int read_file_and_strip(FILE *f, void *data, size_t *size); - char *filemngt_calc_localname(char *global_name) { static unsigned int unique_str_num = 0; @@ -108,15 +107,7 @@ int filemngt_read_file(char *filename, void *buffer, size_t *size, int strip) return -1; } - if (strip) { - result = read_file_and_strip(f, buffer, size); - } - else { - do { - result = fread(buffer, 1, *size, f); - } while (result == -1 && errno == EINTR); - result = (result == *size) ? 0 : -1; - } + result = read_file_and_strip(f, buffer, size, strip); if (result == -1) err_printf("Error reading from file %s: %s\n", filename, strerror(errno)); diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 4a6f4f5d..7bd5a268 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -537,18 +537,15 @@ static void *handle_setup_file_buffer(ldcs_process_data_t *procdata, char *pathn cresult = ldcs_cache_findFileDirInCache(filename, dirname, localname); if (cresult == LDCS_CACHE_FILE_FOUND && *localname) { debug_printf3("File %s was already in cache with localname %s\n", pathname, *localname); - // cobo_dbg_printf("File %s (dir: %s) was already in cache with localname %s", pathname, dirname, *localname); *already_loaded = 1; return NULL; } else if (cresult == LDCS_CACHE_FILE_FOUND) { debug_printf3("File %s was in cache, but not stored on local disk\n", pathname); - // cobo_dbg_printf("File %s was in cache, but not stored on 
local disk", pathname); *already_loaded = 0; } else if (cresult == LDCS_CACHE_FILE_NOT_FOUND) { debug_printf3("File %s wasn't in cache\n", pathname); - // cobo_dbg_printf("File %s wasn't in cache", pathname); ldcs_cache_addFileDir(dirname, filename); *already_loaded = 0; } @@ -633,9 +630,9 @@ static int handle_read_and_broadcast_file(ldcs_process_data_t *procdata, char *p /* Setup buffer for file contents */ buffer = handle_setup_file_buffer(procdata, pathname, size, &fd, &localname, &already_loaded); if (!buffer) { - assert(!already_loaded); - global_result = -1; - goto done; + assert(!already_loaded); + global_result = -1; + goto done; } /* Actually read the file into the buffer */ @@ -661,7 +658,6 @@ static int handle_read_and_broadcast_file(ldcs_process_data_t *procdata, char *p goto done; } - /* distribute file data */ if (bcast != suppress_broadcast) { result = handle_broadcast_file(procdata, pathname, buffer, newsize, bcast); @@ -1010,9 +1006,6 @@ static int handle_send_directory_query(ldcs_process_data_t *procdata, char *dire bytes_written = snprintf(out_msg.data, MAX_PATH_LEN+1, "D%s", directory); out_msg.header.len = bytes_written+1; procdata->md_path = directory; -#ifdef LDCS_DBG - cobo_dbg_printf("handle_send_directory_query: %s (%s)", procdata->md_path, directory); -#endif ldcs_audit_server_md_forward_query(procdata, &out_msg); procdata->md_path = NULL; return 0; @@ -1036,9 +1029,6 @@ static int handle_send_file_query(ldcs_process_data_t *procdata, char *fullpath) procdata->md_path = fullpath; -#ifdef LDCS_DBG - cobo_dbg_printf("handle_send_file_query: %s (%s)", procdata->md_path, fullpath); -#endif ldcs_audit_server_md_forward_query(procdata, &out_msg); procdata->md_path = NULL; return 0; @@ -1250,49 +1240,34 @@ int handle_server_message(ldcs_process_data_t *procdata, node_peer_t peer, ldcs_ { switch (msg->header.type) { case LDCS_MSG_CACHE_ENTRIES: - // md_cobo_dbg_printf("LDCS_MSG_CACHE_ENTRIES"); return handle_directory_recv(procdata, msg, 
request_broadcast); case LDCS_MSG_FILE_DATA: - // md_cobo_dbg_printf("LDCS_MSG_FILE_DATA"); return handle_file_recv(procdata, msg, peer, request_broadcast); case LDCS_MSG_FILE_REQUEST: - // md_cobo_dbg_printf("LDCS_MSG_FILE_REQUEST"); return handle_request(procdata, peer, msg); case LDCS_MSG_EXIT: - // md_cobo_dbg_printf("LDCS_MSG_EXIT"); return handle_exit_broadcast(procdata); case LDCS_MSG_PRELOAD_FILELIST: - // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_FILELIST"); return handle_preload_filelist(procdata, msg); case LDCS_MSG_PRELOAD_DIR: - // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_DIR"); return handle_directory_recv(procdata, msg, preload_broadcast); case LDCS_MSG_PRELOAD_FILE: - // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_FILE"); return handle_file_recv(procdata, msg, peer, preload_broadcast); case LDCS_MSG_PRELOAD_DONE: - // md_cobo_dbg_printf("LDCS_MSG_PRELOAD_DONE"); return handle_preload_done(procdata); case LDCS_MSG_SELFLOAD_FILE: - // md_cobo_dbg_printf("LDCS_MSG_SELFLOAD_FILE"); return handle_recv_selfload_file(procdata, msg); case LDCS_MSG_STAT_NET_RESULT: - // md_cobo_dbg_printf("LDCS_MSG_STAT_NET_RESULT"); return handle_metadata_recv(procdata, msg, metadata_stat, peer); case LDCS_MSG_STAT_NET_REQUEST: - // md_cobo_dbg_printf("LDCS_MSG_STAT_NET_REQUEST"); return handle_metadata_request_recv(procdata, msg, metadata_stat, peer); case LDCS_MSG_LOADER_DATA_NET_RESP: - // md_cobo_dbg_printf("LDCS_MSG_LOADER_DATA_NET_RESP"); return handle_metadata_recv(procdata, msg, metadata_loader, peer); case LDCS_MSG_LOADER_DATA_NET_REQ: - // md_cobo_dbg_printf("LDCS_MSG_LOADER_DATA_NET_REQ"); return handle_metadata_request_recv(procdata, msg, metadata_loader, peer); case LDCS_MSG_EXIT_READY: - // md_cobo_dbg_printf("LDCS_MSG_EXIT_READY"); return handle_exit_ready_msg(procdata, msg); case LDCS_MSG_EXIT_CANCEL: - // md_cobo_dbg_printf("LDCS_MSG_EXIT_CANCEL"); return handle_exit_cancel_msg(procdata, msg); default: err_printf("Received unexpected message from node: %d\n", (int) 
msg->header.type); @@ -1916,9 +1891,6 @@ static int handle_metadata_request(ldcs_process_data_t *procdata, char *pathname msg.data = pathname; procdata->md_path = pathname; -#ifdef LDCS_DBG - cobo_dbg_printf("handle_metadata_request: %s", procdata->md_path); -#endif ret = ldcs_audit_server_md_forward_query(procdata, &msg); procdata->md_path = NULL; return ret; @@ -2044,9 +2016,6 @@ static int handle_send_exit_ready_if_done(ldcs_process_data_t *procdata) else { debug_printf2("Sending exit ready message to parent\n"); procdata->md_path = NULL; -#ifdef LDCS_DBG - cobo_dbg_printf("handle_send_exit_ready_if_done: %s", procdata->md_path); -#endif return ldcs_audit_server_md_forward_query(procdata, &msg); } } diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index ffa1090d..2ad908e2 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -233,16 +233,6 @@ int ldcs_audit_server_run() /* Statistic functions */ int _ldcs_server_stat_print ( ldcs_server_stat_t *server_stat ) { int rc=0; - /* cobo_dbg_printf("SERVER[%02d] STAT: #conn=%2d md_size=%2d md_fan_out=%2d listen_time=%8.4f select_time=%8.4f ts_first_connect=%16.6f hostname=%s", */ - /* server_stat->md_rank, */ - /* server_stat->num_connections, */ - /* server_stat->md_size, */ - /* server_stat->md_fan_out, */ - /* server_stat->listen_time, */ - /* server_stat->select_time, */ - /* server_stat->starttime, */ - /* server_stat->hostname ); */ - debug_printf("SERVER[%02d] STAT: #conn=%2d md_size=%2d md_fan_out=%2d listen_time=%8.4f select_time=%8.4f ts_first_connect=%16.6f hostname=%s\n", server_stat->md_rank, server_stat->num_connections, @@ -255,119 +245,54 @@ int ldcs_audit_server_run() #define MYFORMAT "SERVER[%02d] STAT: %-10s, #cnt=%5d, bytes=%8.2f MB, time=%8.4f sec\n" - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"libread", */ - /* server_stat->libread.cnt, */ - /* 
server_stat->libread.bytes/1024.0/1024.0, */ - /* server_stat->libread.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"libstore", */ - /* server_stat->libstore.cnt, */ - /* server_stat->libstore.bytes/1024.0/1024.0, */ - /* server_stat->libstore.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"libdist", */ - /* server_stat->libdist.cnt, */ - /* server_stat->libdist.bytes/1024.0/1024.0, */ - /* server_stat->libdist.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"procdir", */ - /* server_stat->procdir.cnt, */ - /* server_stat->procdir.bytes/1024.0/1024.0, */ - /* server_stat->procdir.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"distdir", */ - /* server_stat->distdir.cnt, */ - /* server_stat->distdir.bytes/1024.0/1024.0, */ - /* server_stat->distdir.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"client_cb", */ - /* server_stat->client_cb.cnt, */ - /* server_stat->client_cb.bytes/1024.0/1024.0, */ - /* server_stat->client_cb.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"server_cb", */ - /* server_stat->server_cb.cnt, */ - /* server_stat->server_cb.bytes/1024.0/1024.0, */ - /* server_stat->server_cb.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"md_cb", */ - /* server_stat->md_cb.cnt, */ - /* server_stat->md_cb.bytes/1024.0/1024.0, */ - /* server_stat->md_cb.time ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"cl_msg_avg", */ - /* server_stat->clientmsg.cnt/((server_stat->num_connections>0)?server_stat->num_connections:1), */ - /* server_stat->clientmsg.bytes/1024.0/1024.0, */ - /* server_stat->clientmsg.time/((server_stat->num_connections>0)?server_stat->num_connections:1) ); */ - /* cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"bcast", */ - /* server_stat->bcast.cnt, */ - /* server_stat->bcast.bytes/1024.0/1024.0, */ - /* server_stat->bcast.time ); */ - /* 
cobo_dbg_printf(MYFORMAT, */ - /* server_stat->md_rank,"preload_cb", */ - /* server_stat->preload.cnt, */ - /* server_stat->preload.bytes/1024.0/1024.0, */ - /* server_stat->preload.time ); */ - - debug_printf(MYFORMAT, server_stat->md_rank,"libread", server_stat->libread.cnt, server_stat->libread.bytes/1024.0/1024.0, server_stat->libread.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"libstore", server_stat->libstore.cnt, server_stat->libstore.bytes/1024.0/1024.0, server_stat->libstore.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"libdist", server_stat->libdist.cnt, server_stat->libdist.bytes/1024.0/1024.0, server_stat->libdist.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"procdir", server_stat->procdir.cnt, server_stat->procdir.bytes/1024.0/1024.0, server_stat->procdir.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"distdir", server_stat->distdir.cnt, server_stat->distdir.bytes/1024.0/1024.0, server_stat->distdir.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"client_cb", server_stat->client_cb.cnt, server_stat->client_cb.bytes/1024.0/1024.0, server_stat->client_cb.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"server_cb", server_stat->server_cb.cnt, server_stat->server_cb.bytes/1024.0/1024.0, server_stat->server_cb.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"md_cb", server_stat->md_cb.cnt, server_stat->md_cb.bytes/1024.0/1024.0, server_stat->md_cb.time ); - debug_printf(MYFORMAT, server_stat->md_rank,"cl_msg_avg", server_stat->clientmsg.cnt/((server_stat->num_connections>0)?server_stat->num_connections:1), From 9204bcf1d665ca4fd1fadd7069f418dcef129798 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 15:59:40 -0700 Subject: [PATCH 05/11] Added runTests_forest --- testsuite/runTests_forest | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100755 testsuite/runTests_forest diff --git a/testsuite/runTests_forest b/testsuite/runTests_forest new file mode 
100755 index 00000000..8d5019bf --- /dev/null +++ b/testsuite/runTests_forest @@ -0,0 +1,57 @@ +#!/bin/sh + +num_roots=$1 + +./run_driver --dependency --push --roots $num_roots +./run_driver --dlopen --push --roots $num_roots +./run_driver --dlreopen --push --roots $num_roots +./run_driver --reorder --push --roots $num_roots +./run_driver --partial --push --roots $num_roots +./run_driver --ldpreload --push --roots $num_roots + +if false; then +./run_driver --dependency --pull --roots $num_roots +./run_driver --dlopen --pull --roots $num_roots +./run_driver --dlreopen --pull --roots $num_roots +./run_driver --reorder --pull --roots $num_roots +./run_driver --partial --pull --roots $num_roots +./run_driver --ldpreload --pull --roots $num_roots + +if test "x$SPINDLE_BLUEGENE" != "xtrue"; then +./run_driver --dependency --fork --roots $num_roots +./run_driver --dlopen --fork --roots $num_roots +./run_driver --dlreopen --fork --roots $num_roots +./run_driver --reorder --fork --roots $num_roots +./run_driver --partial --fork --roots $num_roots +./run_driver --ldpreload --fork --roots $num_roots +fi + +if test "x$SPINDLE_BLUEGENE" != "xtrue"; then +./run_driver --dependency --forkexec --roots $num_roots +./run_driver --dlopen --forkexec --roots $num_roots +./run_driver --dlreopen --forkexec --roots $num_roots +./run_driver --reorder --forkexec --roots $num_roots +./run_driver --partial --forkexec --roots $num_roots +./run_driver --ldpreload --forkexec --roots $num_roots +fi + +./run_driver --dependency --chdir --roots $num_roots +./run_driver --dlopen --chdir --roots $num_roots +./run_driver --dlreopen --chdir --roots $num_roots +./run_driver --reorder --chdir --roots $num_roots +./run_driver --partial --chdir --roots $num_roots +./run_driver --ldpreload --chdir--roots $num_roots + +./run_driver --dependency --preload --roots $num_roots +./run_driver --dlopen --preload --roots $num_roots +./run_driver --dlreopen --preload --roots $num_roots +./run_driver --reorder 
--preload --roots $num_roots +./run_driver --partial --preload --roots $num_roots +./run_driver --ldpreload --preload --roots $num_roots +fi + +#sleep 10 +#handler + +echo "Done." + From b02c26b9e445bcbe8449e5bbf45181ec4dbb4ae2 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 16:01:34 -0700 Subject: [PATCH 06/11] clean up --- src/cobo/cobo.c | 138 +----------------------------------------------- 1 file changed, 1 insertion(+), 137 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index e4e9f88d..46a347bd 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -975,147 +975,11 @@ static int cobo_open_tree() { int sockfd; int i = 0; -#if 1 + sockfd = cobo_create_socket(); cobo_bind_and_listen(sockfd); cobo_parent_fd = cobo_accept_and_handshake(sockfd); close(sockfd); -#else - /* create a socket to accept connection from parent IPPROTO_TCP */ - sockfd = socket(AF_INET, SOCK_STREAM, 0); - if (sockfd < 0) { - err_printf("Creating parent socket (socket() %m errno=%d)\n", - errno); - exit(1); - } - - /* TODO: could recycle over port numbers, trying to bind to one for some time */ - /* try to bind the socket to one the ports in our allowed range */ - - int port_is_bound = 0; - while (i < cobo_num_ports && !port_is_bound) { - /* pick a port */ - int port = cobo_ports[i]; - i++; - - /* set up an address using our selected port */ - struct sockaddr_in sin; - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - - /* attempt to bind a socket on this port */ - if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { - - debug_printf3("Binding parent socket (bind() %m errno=%d) port=%d\n", - errno, port); - continue; - } - - /* set the socket to listen for connections */ - if (listen(sockfd, 1) < 0) { - debug_printf3("Setting parent socket to listen (listen() %m errno=%d) port=%d\n", - errno, port); - continue; - } - - /* bound and listening on our port */ - 
debug_printf3("Opened socket on port %d\n", port); - port_is_bound = 1; - } - - /* failed to bind to a port, this is fatal */ - if (!port_is_bound) { - /* TODO: would like to send an abort back to server */ - err_printf("Failed to open socket on any port\n"); - exit(1); - } - - - /* accept a connection from parent and receive socket table */ - int reply_timeout = cobo_connect_timeout * 100; - int have_parent = 0; - while (!have_parent) { - struct sockaddr parent_addr; - socklen_t parent_len = sizeof(parent_addr); - cobo_parent_fd = accept(sockfd, (struct sockaddr *) &parent_addr, &parent_len); - - _cobo_opt_socket(sockfd); - - /* handshake/authenticate our connection to make sure it one of our processes */ - int result = spindle_handshake_server(cobo_parent_fd, &cobo_handshake, cobo_sessionid); - switch (result) { - case HSHAKE_SUCCESS: - break; - case HSHAKE_INTERNAL_ERROR: - err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); - break; - case HSHAKE_DROP_CONNECTION: - debug_printf3("Handshake said to drop connection\n"); - close(cobo_parent_fd); - continue; - case HSHAKE_ABORT: - handle_security_error(spindle_handshake_last_error_str()); - abort(); - default: - assert(0 && "Unknown return value from handshake_server\n"); - } - - /* read the service id */ - unsigned int received_serviceid = 0; - if (cobo_read_fd_w_timeout(cobo_parent_fd, &received_serviceid, sizeof(received_serviceid), reply_timeout) < 0) { - debug_printf3("Receiving service id from new connection failed\n"); - close(cobo_parent_fd); - continue; - } - - /* read the session id */ - uint64_t received_sessionid = 0; - if (cobo_read_fd_w_timeout(cobo_parent_fd, &received_sessionid, sizeof(received_sessionid), reply_timeout) < 0) { - debug_printf3("Receiving session id from new connection failed\n"); - close(cobo_parent_fd); - continue; - } - - /* check that we got the expected sesrive and session ids */ - /* TODO: reply with some sort of error message if no 
match? */ - if (received_serviceid != cobo_serviceid || received_sessionid != cobo_sessionid) { - close(cobo_parent_fd); - continue; - } - - /* write our service id back as a reply */ - if (cobo_write_fd_w_suppress(cobo_parent_fd, &cobo_serviceid, sizeof(cobo_serviceid), 1) < 0) { - debug_printf3("Writing service id to new connection failed\n"); - close(cobo_parent_fd); - continue; - } - - /* write our accept id back as a reply */ - if (cobo_write_fd_w_suppress(cobo_parent_fd, &cobo_acceptid, sizeof(cobo_acceptid), 1) < 0) { - debug_printf3("Writing accept id to new connection failed\n"); - close(cobo_parent_fd); - continue; - } - - /* our parent may have dropped us if he was too impatient waiting for our reply, - * read his ack to know that he completed the connection */ - unsigned int ack = 0; - if (cobo_read_fd_w_timeout(cobo_parent_fd, &ack, sizeof(ack), reply_timeout) < 0) { - debug_printf3("Receiving ack to finalize connection\n"); - close(cobo_parent_fd); - continue; - } - - /* if we get here, we've got a good connection to our parent */ - have_parent = 1; - } - - /* we've got the connection to our parent, so close the listening socket */ - close(sockfd); -#endif cobo_gettimeofday(&tree_start); From d1acad49e4f98bc1e5f125144c3d34de411c5501 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 16:06:57 -0700 Subject: [PATCH 07/11] clean up --- src/cobo/cobo.c | 8 ++- .../auditserver/ldcs_audit_server_md_cobo.c | 72 +------------------ 2 files changed, 6 insertions(+), 74 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 46a347bd..5fa58024 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -150,9 +150,11 @@ static double cobo_getsecs(struct timeval* tv2, struct timeval* tv1) /* Fills in timeval via gettimeofday */ static void cobo_gettimeofday(struct timeval* tv) { +#if 0 if (gettimeofday(tv, NULL) < 0) { err_printf("Getting time (gettimeofday() %m errno=%d)\n", errno); } +#endif } /* Reads environment variable, bails if not set 
*/ @@ -1827,9 +1829,9 @@ int cobo_close() debug_printf3("Exiting cobo_close(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); debug_printf3("Total time from cobo_open() to cobo_close() took %f seconds for %d procs\n", cobo_getsecs(&time_close, &time_open), cobo_nprocs); - if (cobo_me == 0) { - cobo_dbg_printf("Total time: %f seconds (%d procs)", cobo_getsecs(&time_close, &time_open), cobo_nprocs); - } + /* if (cobo_me == 0) { */ + /* cobo_dbg_printf("Total time: %f seconds (%d procs)", cobo_getsecs(&time_close, &time_open), cobo_nprocs); */ + /* } */ return COBO_SUCCESS; } diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index a9f3c5d3..209ea805 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -101,9 +101,6 @@ static int ldcs_audit_server_md_get_responsible_tree_id(char *path) dir = dirname(path_dup); } responsible_tree_id = ldcs_audit_server_md_hashval(dir) % spindle_root_count; -#ifdef LDCS_DBG - cobo_dbg_printf("%s -> %d", path, responsible_tree_id); -#endif free(path_dup); } return responsible_tree_id; @@ -221,32 +218,6 @@ void ldcs_audit_server_md_barrier() } -/* int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) { */ -/* int rc=0, i; */ -/* int parent_fd, child_fd; */ -/* int num_childs; */ - -/* /\* Registering parents *\/ */ -/* if(cobo_get_parent_socket(&parent_fd)!=COBO_SUCCESS) { */ -/* err_printf("Error, could not get parent socket\n"); */ -/* assert(0); */ -/* } */ - -/* debug_printf3("Registering fd %d for cobo parent connection\n",parent_fd); */ -/* ldcs_listen_register_fd(parent_fd, 0, &ldcs_audit_server_md_cobo_CB, (void *) ldcs_process_data); */ -/* ldcs_process_data->md_listen_to_parent=1; */ - -/* /\* Registering childs *\/ */ -/* cobo_get_num_childs(&num_childs); */ -/* for (i = 0; imd_listen_to_parent) { */ -/* 
if(cobo_get_parent_socket(&parent_fd)!=COBO_SUCCESS) { */ -/* _error("cobo internal error (parent socket)"); */ -/* } */ -/* ldcs_process_data->md_listen_to_parent=0; */ -/* ldcs_listen_unregister_fd(parent_fd); */ - -/* cobo_get_num_childs(&num_childs); */ -/* for (i = 0; imd_rank, responsible_tree_id); -#endif - if(ldcs_process_data->md_rank == responsible_tree_id) { cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); debug_printf3("Decided I am responsible for file %s\n", filename); @@ -377,11 +325,7 @@ int ldcs_audit_server_md_forward_query(ldcs_process_data_t *ldcs_process_data, l int result; int responsible_tree_id; - // cobo_dbg_printf("%s", __func__); responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); -#ifdef LDCS_DBG - cobo_dbg_printf("upward: %s (tree_id: %d)", ldcs_process_data->md_path, responsible_tree_id); -#endif if (ldcs_process_data->md_rank == responsible_tree_id) { /* We're root--no one to forward a query to*/ @@ -509,13 +453,7 @@ int ldcs_audit_server_md_broadcast(ldcs_process_data_t *ldcs_process_data, ldcs_ int num_childs = 0; int responsible_tree_id = 0; - responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); -#ifdef LDCS_DBG - cobo_dbg_printf("dwward: %s, type: %d (tree_id: %d) bcast", ldcs_process_data->md_path, responsible_tree_id, - msg->header.type); -#endif - cobo_get_num_forest_childs(responsible_tree_id, &num_childs); for (i = 0; imd_path); -#endif return ldcs_audit_server_md_broadcast(ldcs_process_data, msg); } - // cobo_dbg_printf("%s", __func__); -#ifdef LDCS_DBG - cobo_dbg_printf("dwward: %s, type: %d (tree_id: %d) bcas noncontigt", ldcs_process_data->md_path, responsible_tree_id, - msg->header.type); -#endif responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(ldcs_process_data->md_path); cobo_get_num_forest_childs(responsible_tree_id, &num_childs); From 
6ade9f9494ac430eb0cef448b1068af478fe10e2 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Fri, 19 May 2017 16:15:01 -0700 Subject: [PATCH 08/11] clean up --- src/cobo/cobo.c | 20 +++++++++++++++++++ .../auditserver/ldcs_audit_server_md_cobo.c | 1 - 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 5fa58024..5471cca9 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1137,6 +1137,23 @@ int cobo_open_forest() return COBO_SUCCESS; } + +/* close socket forest */ +int cobo_close_forest() +{ + int i; + for (i = 0; i < cobo_num_forest_childs; i++) { + close(cobo_forest_childs_fd[i]); + close(cobo_forest_parents_fd[i]); + } + cobo_is_forest_opened = 0; + cobo_free(cobo_forest_childs); + cobo_free(cobo_forest_parents); + cobo_free(cobo_forest_childs_fd); + cobo_free(cobo_forest_parents_fd); + return COBO_SUCCESS; +} + /* * ============================= * Functions to bcast/gather/scatter with root as rank 0 using the TCP/socket tree. @@ -1820,6 +1837,9 @@ int cobo_close() debug_printf3("Starting cobo_close()"); /* shut down the tree */ cobo_close_tree(); + if (cobo_is_forest_opened) { + cobo_close_forest(); + } /* free our data structures */ cobo_free(cobo_ports); diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 209ea805..c93bf2e0 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -302,7 +302,6 @@ int ldcs_audit_server_md_destroy ( ldcs_process_data_t *ldcs_process_data ) /*TODO: close cobo for time measurement, but remove it*/ if (cobo_close() != COBO_SUCCESS) { debug_printf3("Failed to close\n"); - printf("Failed to close\n"); exit(1); } return 0; From b85f382f0a2bf831d65de6ed947f11c6bfd380f3 Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Mon, 22 May 2017 13:49:37 -0700 Subject: [PATCH 09/11] minor update --- src/cobo/cobo.c | 39 +++---------------- src/cobo/ldcs_cobo.h | 2 
+ src/logging/spindle_debug.h | 5 ++- .../auditserver/ldcs_audit_server_md_cobo.c | 6 ++- testsuite/runTests_forest | 3 ++ 5 files changed, 20 insertions(+), 35 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 5471cca9..453294b3 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -55,6 +55,8 @@ Place, Suite 330, Boston, MA 02111-1307 USA #define COBO_CONNECT_TIMELIMIT (600) /* seconds -- wait this long before giving up for good */ #endif +#define ENABLE_HANDSHAKE + #if defined(_IA64_) #undef htons #undef ntohs @@ -1467,9 +1469,6 @@ int cobo_get_forest_parent_socket_at(int num, int *fd) return COBO_SUCCESS; } -#define COBO_FOREST - -#ifdef COBO_FOREST int cobo_get_child_socket(int num, int *fd) { if (cobo_is_forest_opened) { @@ -1480,16 +1479,7 @@ int cobo_get_child_socket(int num, int *fd) } return COBO_SUCCESS; } -#else -int cobo_get_child_socket(int num, int *fd) -{ - assert(num < cobo_num_child); - *fd = cobo_child_fd[num]; - return COBO_SUCCESS; -} -#endif -#ifdef COBO_FOREST int cobo_get_num_childs(int* num_childs) { if (cobo_is_forest_opened) { cobo_get_num_forest_childs(0, num_childs); @@ -1498,20 +1488,12 @@ int cobo_get_num_childs(int* num_childs) { } return COBO_SUCCESS; } -#else -int cobo_get_num_childs(int* num_childs) { - *num_childs=cobo_num_child; - return COBO_SUCCESS; -} -#endif - /* fills in fd with socket file desriptor to our parent */ /* TODO: the upside here is that the upper layer can directly use our * communication tree, but the downside is that it exposes the implementation * and forces sockets */ -#ifdef COBO_FOREST int cobo_get_parent_socket(int* fd) { if (cobo_is_forest_opened){ @@ -1525,17 +1507,6 @@ int cobo_get_parent_socket(int* fd) } return COBO_SUCCESS; } -#else -int cobo_get_parent_socket(int* fd) -{ - if (cobo_parent_fd != -1) { - *fd = cobo_parent_fd; - return COBO_SUCCESS; - } - return -1; /* failure RCs? 
*/ -} - -#endif /* Perform barrier, each task writes an int then waits for an int */ int cobo_barrier() @@ -1835,11 +1806,13 @@ int cobo_close() struct timeval start, end; cobo_gettimeofday(&start); debug_printf3("Starting cobo_close()"); - /* shut down the tree */ - cobo_close_tree(); + + /* shut down the forest */ if (cobo_is_forest_opened) { cobo_close_forest(); } + /* shut down the tree */ + cobo_close_tree(); /* free our data structures */ cobo_free(cobo_ports); diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 57e600a4..2dd01531 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -41,6 +41,8 @@ extern "C" { #define COBO_SUCCESS (0) #define COBO_PRIMARY_TREE (0) #define COBO_FOREST (-1) +#define COBO_ALL_PARENTS (-2) +#define COBO_ALL_CHILDS (-3) #define COBO_NAMESPACE ldcs diff --git a/src/logging/spindle_debug.h b/src/logging/spindle_debug.h index 1242ea8d..48fbd680 100644 --- a/src/logging/spindle_debug.h +++ b/src/logging/spindle_debug.h @@ -71,7 +71,7 @@ extern "C" { #endif -//#define FOREST_DEBUG +#define FOREST_DEBUG #if defined(FOREST_DEBUG) #define cobo_dbg_printf(format, ...) 
\ do { \ @@ -85,3 +85,6 @@ extern "C" { #endif + +#define err_printf cobo_dbg_printf + diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index c93bf2e0..7d527de5 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -225,7 +225,9 @@ int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) int num_parents, num_childs; /* Registering parents */ + cobo_dbg_printf("before 1"); cobo_get_num_forest_parents(COBO_FOREST, &num_parents); + cobo_dbg_printf("after 1"); for (i = 0; i < num_parents; i++) { if(cobo_get_forest_parent_socket_at(i, &parent_fd)!=COBO_SUCCESS) { err_printf("Error, could not get parent socket\n"); @@ -265,7 +267,9 @@ int ldcs_audit_server_md_unregister_fd ( ldcs_process_data_t *ldcs_process_data if (!ldcs_process_data->md_listen_to_parent) return rc; /* Registering parents */ + cobo_dbg_printf("before 2"); cobo_get_num_forest_parents(COBO_FOREST, &num_parents); + cobo_dbg_printf("before 3"); for (i = 0; i < num_parents; i++) { if(cobo_get_forest_parent_socket_at(i, &parent_fd)!=COBO_SUCCESS) { err_printf("Error, could not get parent socket\n"); @@ -312,7 +316,7 @@ int ldcs_audit_server_md_is_responsible ( ldcs_process_data_t *ldcs_process_data int responsible_tree_id = ldcs_audit_server_md_get_responsible_tree_id(filename); if(ldcs_process_data->md_rank == responsible_tree_id) { - cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); + // cobo_dbg_printf("I am responsible for file: %s (tree_id: %d)", filename, responsible_tree_id); debug_printf3("Decided I am responsible for file %s\n", filename); return 1; } diff --git a/testsuite/runTests_forest b/testsuite/runTests_forest index 8d5019bf..d56d26aa 100755 --- a/testsuite/runTests_forest +++ b/testsuite/runTests_forest @@ -2,7 +2,10 @@ num_roots=$1 +./run_driver --dependency --push +exit ./run_driver 
--dependency --push --roots $num_roots +exit ./run_driver --dlopen --push --roots $num_roots ./run_driver --dlreopen --push --roots $num_roots ./run_driver --reorder --push --roots $num_roots From fd6310adbb20c4f98f72cad40ee53e0356450e7f Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Mon, 22 May 2017 16:12:56 -0700 Subject: [PATCH 10/11] commount out FOREST_DEBUG --- src/cobo/cobo.c | 142 +++++++++--------- src/logging/spindle_debug.h | 6 +- .../auditserver/ldcs_audit_server_md_cobo.c | 5 - testsuite/runTests_forest | 13 +- 4 files changed, 75 insertions(+), 91 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 453294b3..fca39c51 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -956,7 +956,6 @@ static int cobo_accept_and_handshake(int sockfd) /* if we get here, we've got a good connection to our parent */ have_parent = 1; } - // cobo_dbg_printf("handhsare done (rank: %d)", cobo_me); return accepted_sockfd; } @@ -1343,16 +1342,14 @@ int cobo_get_num_tree(int *num_trees) int cobo_get_forest_child_socket(int root, int num, int *fd) { int num_forest_childs; - - if (!cobo_is_forest_opened) { - if (root == COBO_PRIMARY_TREE) { - return cobo_get_child_socket(num, fd); - } else { - err_printf("Trying to use forest before cobo_open_forest"); - exit(1); - } - } - + /* if (!cobo_is_forest_opened) { */ + /* if (root == COBO_PRIMARY_TREE) { */ + /* return cobo_get_child_socket(num, fd); */ + /* } else { */ + /* err_printf("Trying to use forest before cobo_open_forest"); */ + /* exit(1); */ + /* } */ + /* } */ if (root == COBO_FOREST) { *fd = cobo_forest_childs_fd[num]; return COBO_SUCCESS; @@ -1372,16 +1369,14 @@ int cobo_get_num_forest_childs(int root, int* num_forest_childs) int num_childs = 0; int logical_rank = 0; /*logical_rank of the root is 0*/ int tmp_cobo_nprocs; - - if (!cobo_is_forest_opened) { - if (root == COBO_PRIMARY_TREE) { - return cobo_get_num_childs(num_forest_childs); - } else { - err_printf("Trying to use forest before 
cobo_open_forest"); - exit(1); - } - } - + /* if (!cobo_is_forest_opened) { */ + /* if (root == COBO_PRIMARY_TREE) { */ + /* return cobo_get_num_childs(num_forest_childs); */ + /* } else { */ + /* err_printf("Trying to use forest before cobo_open_forest"); */ + /* exit(1); */ + /* } */ + /* } */ if (root == COBO_FOREST) { *num_forest_childs = cobo_num_forest_childs; return COBO_SUCCESS; @@ -1407,44 +1402,39 @@ int cobo_get_forest_parent_socket(int root, int *fd) { int num_childs; int forest_parents_fd_index; - - if (!cobo_is_forest_opened) { - if (root == COBO_PRIMARY_TREE) { - return cobo_get_parent_socket(fd); - } else { - err_printf("Trying to use forest before cobo_open_forest"); - exit(1); - } - } - + /* if (!cobo_is_forest_opened) { */ + /* if (root == COBO_PRIMARY_TREE) { */ + /* return cobo_get_parent_socket(fd); */ + /* } else { */ + /* err_printf("Trying to use forest before cobo_open_forest"); */ + /* exit(1); */ + /* } */ + /* } */ if (cobo_me == root) { if (cobo_me != 0) { cobo_dbg_printf("root (tree_id=%d) does not have parent", root); exit(1); } *fd = cobo_parent_fd; - // fprintf(stderr, "cobo_index: %d\n", -1); } else { cobo_get_num_forest_childs(root, &num_childs); forest_parents_fd_index = num_childs; *fd = cobo_forest_parents_fd[forest_parents_fd_index]; - // fprintf(stderr, "cobo_index: %d\n", forest_parents_fd_index); } return COBO_SUCCESS; } int cobo_get_num_forest_parents(int root, int *num_parents) { - if (!cobo_is_forest_opened) { - if (root == COBO_PRIMARY_TREE) { - *num_parents = 1; - return COBO_SUCCESS; - } else { - err_printf("Trying to use forest before cobo_open_forest"); - exit(1); - } - } - + /* if (!cobo_is_forest_opened) { */ + /* if (root == COBO_PRIMARY_TREE) { */ + /* *num_parents = 1; */ + /* return COBO_SUCCESS; */ + /* } else { */ + /* err_printf("Trying to use forest before cobo_open_forest"); */ + /* exit(1); */ + /* } */ + /* } */ if (root == COBO_FOREST) { *num_parents = cobo_num_forest_childs; } else { @@ -1455,37 
+1445,38 @@ int cobo_get_num_forest_parents(int root, int *num_parents) int cobo_get_forest_parent_socket_at(int num, int *fd) { - if (!cobo_is_forest_opened) { - if (num == COBO_PRIMARY_TREE) { - *fd = cobo_parent_fd; - return COBO_SUCCESS; - } else { - err_printf("Trying to use forest before cobo_open_forest"); - exit(1); - } - } - + /* if (!cobo_is_forest_opened) { */ + /* if (num == COBO_PRIMARY_TREE) { */ + /* *fd = cobo_parent_fd; */ + /* return COBO_SUCCESS; */ + /* } else { */ + /* err_printf("Trying to use forest before cobo_open_forest"); */ + /* exit(1); */ + /* } */ + /* } */ *fd = cobo_forest_parents_fd[num]; return COBO_SUCCESS; } int cobo_get_child_socket(int num, int *fd) { - if (cobo_is_forest_opened) { - cobo_get_forest_child_socket(0, num, fd); - } else { - assert(num < cobo_num_child); - *fd = cobo_child_fd[num]; - } + /* if (cobo_is_forest_opened) { */ + /* cobo_get_forest_child_socket(0, num, fd); */ + /* } else { */ + /* assert(num < cobo_num_child); */ + /* *fd = cobo_child_fd[num]; */ + /* } */ + cobo_get_forest_child_socket(0, num, fd); return COBO_SUCCESS; } int cobo_get_num_childs(int* num_childs) { - if (cobo_is_forest_opened) { - cobo_get_num_forest_childs(0, num_childs); - } else { - *num_childs=cobo_num_child; - } + /* if (cobo_is_forest_opened) { */ + /* cobo_get_num_forest_childs(0, num_childs); */ + /* } else { */ + /* *num_childs=cobo_num_child; */ + /* } */ + cobo_get_num_forest_childs(0, num_childs); return COBO_SUCCESS; } @@ -1496,15 +1487,16 @@ int cobo_get_num_childs(int* num_childs) { * and forces sockets */ int cobo_get_parent_socket(int* fd) { - if (cobo_is_forest_opened){ - cobo_get_forest_parent_socket(0, fd); - } else { - if (cobo_parent_fd != -1) { - *fd = cobo_parent_fd; - return COBO_SUCCESS; - } - return -1; /* failure RCs? 
*/ - } + /* if (cobo_is_forest_opened){ */ + /* cobo_get_forest_parent_socket(0, fd); */ + /* } else { */ + /* if (cobo_parent_fd != -1) { */ + /* *fd = cobo_parent_fd; */ + /* return COBO_SUCCESS; */ + /* } */ + /* return -1; /\* failure RCs? *\/ */ + /* } */ + cobo_get_forest_parent_socket(0, fd); return COBO_SUCCESS; } @@ -1778,13 +1770,15 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* /* open the tree */ cobo_open_tree(); - /* need to check that tree opened successfully before returning, so do a barrier */ if (cobo_barrier() != COBO_SUCCESS) { err_printf("Failed to open tree\n"); exit(1); } + /* open the forest */ + cobo_open_forest(); + if (cobo_me == 0) { cobo_gettimeofday(&tree_end); debug_printf3("Exiting cobo_close(), took %f seconds for %d procs\n", cobo_getsecs(&tree_end,&tree_start), cobo_nprocs); diff --git a/src/logging/spindle_debug.h b/src/logging/spindle_debug.h index 48fbd680..653c4ba1 100644 --- a/src/logging/spindle_debug.h +++ b/src/logging/spindle_debug.h @@ -71,13 +71,14 @@ extern "C" { #endif -#define FOREST_DEBUG +//#define FOREST_DEBUG #if defined(FOREST_DEBUG) #define cobo_dbg_printf(format, ...) \ do { \ fprintf(stderr, "COBO:%6d: " format " (%s:%d)\n", getpid(), ## __VA_ARGS__, __FILE__, __LINE__); \ } while (0) - +#undef err_printf +#define err_printf cobo_dbg_printf #else #define cobo_dbg_printf(format, ...) #define md_cobo_dbg_printf(format, ...) 
@@ -86,5 +87,4 @@ extern "C" { #endif -#define err_printf cobo_dbg_printf diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 7d527de5..f415f339 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -203,7 +203,6 @@ int ldcs_audit_server_md_init_post_process(unsigned int md_roots) if (cobo_rank == 0) { cobo_dbg_printf("root_count: %d root_hop: %d", spindle_root_count, spindle_root_hop); } - cobo_open_forest(); } else { spindle_root_count = 1; spindle_root_hop = cobo_size / spindle_root_count; @@ -225,9 +224,7 @@ int ldcs_audit_server_md_register_fd ( ldcs_process_data_t *ldcs_process_data ) int num_parents, num_childs; /* Registering parents */ - cobo_dbg_printf("before 1"); cobo_get_num_forest_parents(COBO_FOREST, &num_parents); - cobo_dbg_printf("after 1"); for (i = 0; i < num_parents; i++) { if(cobo_get_forest_parent_socket_at(i, &parent_fd)!=COBO_SUCCESS) { err_printf("Error, could not get parent socket\n"); @@ -267,9 +264,7 @@ int ldcs_audit_server_md_unregister_fd ( ldcs_process_data_t *ldcs_process_data if (!ldcs_process_data->md_listen_to_parent) return rc; /* Registering parents */ - cobo_dbg_printf("before 2"); cobo_get_num_forest_parents(COBO_FOREST, &num_parents); - cobo_dbg_printf("before 3"); for (i = 0; i < num_parents; i++) { if(cobo_get_forest_parent_socket_at(i, &parent_fd)!=COBO_SUCCESS) { err_printf("Error, could not get parent socket\n"); diff --git a/testsuite/runTests_forest b/testsuite/runTests_forest index d56d26aa..69021942 100755 --- a/testsuite/runTests_forest +++ b/testsuite/runTests_forest @@ -1,18 +1,16 @@ #!/bin/sh +#export SPINDLE_TEST_ARGS="$@" +export SPINDLE_BLUEGENE="false" num_roots=$1 -./run_driver --dependency --push -exit -./run_driver --dependency --push --roots $num_roots -exit +if false ; then ./run_driver --dlopen --push --roots $num_roots ./run_driver --dlreopen --push --roots 
$num_roots ./run_driver --reorder --push --roots $num_roots ./run_driver --partial --push --roots $num_roots ./run_driver --ldpreload --push --roots $num_roots -if false; then ./run_driver --dependency --pull --roots $num_roots ./run_driver --dlopen --pull --roots $num_roots ./run_driver --dlreopen --pull --roots $num_roots @@ -28,6 +26,7 @@ if test "x$SPINDLE_BLUEGENE" != "xtrue"; then ./run_driver --partial --fork --roots $num_roots ./run_driver --ldpreload --fork --roots $num_roots fi +fi if test "x$SPINDLE_BLUEGENE" != "xtrue"; then ./run_driver --dependency --forkexec --roots $num_roots @@ -51,10 +50,6 @@ fi ./run_driver --reorder --preload --roots $num_roots ./run_driver --partial --preload --roots $num_roots ./run_driver --ldpreload --preload --roots $num_roots -fi -#sleep 10 -#handler -echo "Done." From f7f019455dd40b03eeeb0e70dcedf2fe28a49adc Mon Sep 17 00:00:00 2001 From: Kento Sato Date: Mon, 22 May 2017 16:15:07 -0700 Subject: [PATCH 11/11] minor update --- testsuite/runTests_forest | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/testsuite/runTests_forest b/testsuite/runTests_forest index 69021942..0d82e550 100755 --- a/testsuite/runTests_forest +++ b/testsuite/runTests_forest @@ -4,7 +4,6 @@ export SPINDLE_BLUEGENE="false" num_roots=$1 -if false ; then ./run_driver --dlopen --push --roots $num_roots ./run_driver --dlreopen --push --roots $num_roots ./run_driver --reorder --push --roots $num_roots @@ -26,7 +25,7 @@ if test "x$SPINDLE_BLUEGENE" != "xtrue"; then ./run_driver --partial --fork --roots $num_roots ./run_driver --ldpreload --fork --roots $num_roots fi -fi + if test "x$SPINDLE_BLUEGENE" != "xtrue"; then ./run_driver --dependency --forkexec --roots $num_roots