From 347c54770ca530a2f30b063cf83411e136db6010 Mon Sep 17 00:00:00 2001 From: ccztux Date: Mon, 27 Feb 2023 16:57:00 +0100 Subject: [PATCH 1/7] Fixed: Naemon stops executing checks and doesnt respawn Core Worker processes (#418) --- src/naemon/workers.c | 74 ++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 35ac0d67f..368e186bb 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL; unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0; unsigned int wproc_num_workers_spawned = 0; +static int get_desired_workers(int desired_workers); +static int spawn_core_worker(void); + #define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0) static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf) @@ -414,6 +417,7 @@ static int handle_worker_result(int sd, int events, void *arg) char *buf, *error_reason = NULL; size_t size; int ret; + unsigned int desired_workers; struct wproc_worker *wp = (struct wproc_worker *)arg; ret = nm_bufferqueue_read(wp->bq, wp->sd); @@ -428,17 +432,32 @@ static int handle_worker_result(int sd, int events, void *arg) nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); - if (workers.len <= 0) { - /* there aren't global workers left, we can't run any more checks - * we should try respawning a few of the standard ones - */ - nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); - } /* remove worker from worker list - this ensures that we don't reassign * its jobs back to itself*/ remove_worker(wp); + desired_workers = get_desired_workers(num_check_workers); + + if (workers.len < desired_workers) { + /* there aren't global workers left, we can't run any more checks + * we should try respawning a few of the standard ones + */ + nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker"); + + /* Respawn a worker */ + if ((ret = spawn_core_worker()) < 0) { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); + } else { + nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); + } + } else if (workers.len == 0) { + /* there aren't global workers left, we can't run any more checks + * we should try respawning a few of the standard ones + */ + nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); + } + /* reassign this dead worker's jobs */ g_hash_table_iter_init(&iter, wp->jobs); while (g_hash_table_iter_next(&iter, NULL, &job_)) { @@ -664,24 +683,8 @@ static int spawn_core_worker(void) } -int init_workers(int desired_workers) +static int get_desired_workers(int desired_workers) { - int i; - - /* - * we register our query handler before launching workers, - * so other workers can join us whenever they're ready - */ - specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, - free, NULL - ); - if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { - log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); - } else { - nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); - return -1; - } - if (desired_workers <= 0) { int cpus = online_cpus(); @@ -708,6 +711,31 @@ int init_workers(int desired_workers) if (desired_workers < (int)workers.len) return -1; + return desired_workers; +} + + +int init_workers(int desired_workers) +{ + int i; + + /* + * we register our query handler before launching workers, + * so other workers can join us whenever they're ready + */ + specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, + free, NULL + ); + if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { + log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); + } else { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); + return -1; + } + + /* Get the number of workers we need */ + desired_workers = get_desired_workers(desired_workers); + for (i = 0; i < desired_workers; i++) spawn_core_worker(); From bf8bf43e0344d11833c446674453edc82e7cce20 Mon Sep 17 00:00:00 2001 From: Christian Zettel Date: Mon, 27 Feb 2023 17:10:00 +0100 Subject: [PATCH 2/7] Update workers.c Fixed the indentation stuff... --- src/naemon/workers.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 368e186bb..2cee17527 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -417,7 +417,7 @@ static int handle_worker_result(int sd, int events, void *arg) char *buf, *error_reason = NULL; size_t size; int ret; - unsigned int desired_workers; + unsigned int desired_workers; struct wproc_worker *wp = (struct wproc_worker *)arg; ret = nm_bufferqueue_read(wp->bq, wp->sd); @@ -437,7 +437,7 @@ static int handle_worker_result(int sd, int events, void *arg) * its jobs back to itself*/ remove_worker(wp); - desired_workers = get_desired_workers(num_check_workers); + desired_workers = get_desired_workers(num_check_workers); if (workers.len < desired_workers) { /* there aren't global workers left, we can't run any more checks @@ -445,18 +445,18 @@ static int handle_worker_result(int sd, int events, void *arg) */ nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker"); - /* Respawn a worker */ - if ((ret = spawn_core_worker()) < 0) { - nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); - } else { - nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); - } + /* Respawn a worker */ + if ((ret = spawn_core_worker()) < 0) { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); + } else { + nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); + } } else if (workers.len == 0) { /* there aren't global workers left, we can't run any more checks * we should try respawning a few of the standard ones */ nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); - } + } /* reassign this dead worker's jobs */ g_hash_table_iter_init(&iter, wp->jobs); @@ -711,7 +711,7 @@ static int get_desired_workers(int desired_workers) if (desired_workers < (int)workers.len) return -1; - return desired_workers; + return desired_workers; } @@ -733,8 +733,8 @@ int init_workers(int desired_workers) return -1; } - /* Get the number of workers we need */ - desired_workers = get_desired_workers(desired_workers); + /* Get the number of workers we need */ + desired_workers = get_desired_workers(desired_workers); for (i = 0; i < desired_workers; i++) spawn_core_worker(); From 302af625c8906f498986e1352cb272bc29ebc999 Mon Sep 17 00:00:00 2001 From: Christian Zettel Date: Mon, 27 Feb 2023 17:34:20 +0100 Subject: [PATCH 3/7] Update workers.c Comment changed. --- src/naemon/workers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 2cee17527..9674c5588 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -453,7 +453,7 @@ static int handle_worker_result(int sd, int events, void *arg) } } else if (workers.len == 0) { /* there aren't global workers left, we can't run any more checks - * we should try respawning a few of the standard ones + * this should never happen, because the respawning will be done in the upper if condition */ nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); } From 3b9b2f0491675e54d7ef8171a19dc6e28954b770 Mon Sep 17 00:00:00 2001 From: nook24 Date: Thu, 2 Mar 2023 21:52:13 +0000 Subject: [PATCH 4/7] Call wproc_destroy with the WPROC_FORCE flag, whenever a Naemon worker process dies. This will cleanup any processes. I'm not sure why the WPROC_FORCE flag exists at all. Maybe this could cause problems, when external workers not spawned by Naemon itself connect to the Query Handler? The original commit ec4dc03db0a08f06591d31edad987e6ae389ab37 from 10 years ago did not contain more information about this Signed-off-by: nook24 --- src/naemon/workers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 9674c5588..d1b68aab1 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -468,7 +468,7 @@ static int handle_worker_result(int sd, int events, void *arg) ); } - wproc_destroy(wp, 0); + wproc_destroy(wp, WPROC_FORCE); return 0; } while ((buf = worker_ioc2msg(wp->bq, &size, 0))) { From 7940a8187b2ab1a328481da5f0d4a77fe0f83169 Mon Sep 17 00:00:00 2001 From: ccztux Date: Mon, 6 Mar 2023 11:30:06 +0100 Subject: [PATCH 5/7] The function get_desired_workers now always returns the desired workers, as requested in https://github.com/naemon/naemon-core/pull/421#pullrequestreview-1323354780 --- src/naemon/workers.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index d1b68aab1..588148b89 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -702,14 +702,8 @@ static int get_desired_workers(int desired_workers) } } } - wproc_num_workers_desired = desired_workers; - - if (workers_alive() == desired_workers) - return 0; - /* can't shrink the number of workers (yet) */ - if (desired_workers < (int)workers.len) - return -1; + wproc_num_workers_desired = desired_workers; return desired_workers; } @@ -736,6 +730,13 @@ int init_workers(int desired_workers) /* Get the number of workers we need */ desired_workers = get_desired_workers(desired_workers); + if (workers_alive() == desired_workers) + return 0; + + /* can't shrink the number of workers (yet) */ + if (desired_workers < (int)workers.len) + return -1; + for (i = 0; i < desired_workers; i++) spawn_core_worker(); From d6c4c6eaa86c3031c03457f48e74bcdc68aa9451 Mon Sep 17 00:00:00 2001 From: nook24 Date: Thu, 23 Mar 2023 21:00:16 +0000 Subject: [PATCH 6/7] Do not crash if wps is NULL. This could happen if a core worker dies and gets respawned Signed-off-by: nook24 --- src/naemon/checks_host.c | 52 +++++++++++++++++++++------------------- src/naemon/workers.c | 7 +++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/naemon/checks_host.c b/src/naemon/checks_host.c index e7f486207..91924f5b2 100644 --- a/src/naemon/checks_host.c +++ b/src/naemon/checks_host.c @@ -636,33 +636,35 @@ static void handle_worker_host_check(wproc_result *wpres, void *arg, int flags) if (currently_running_host_checks > 0) currently_running_host_checks--; - hst = find_host(cr->host_name); - if (hst && wpres) { - hst->is_executing = FALSE; - memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage)); - cr->start_time.tv_sec = wpres->start.tv_sec; - cr->start_time.tv_usec = wpres->start.tv_usec; - cr->finish_time.tv_sec = wpres->stop.tv_sec; - cr->finish_time.tv_usec = wpres->stop.tv_usec; - if (WIFEXITED(wpres->wait_status)) { - cr->return_code = WEXITSTATUS(wpres->wait_status); - } else { - cr->return_code = STATE_UNKNOWN; - } + if (wpres) { + hst = find_host(cr->host_name); + if (hst) { + hst->is_executing = FALSE; + memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage)); + cr->start_time.tv_sec = wpres->start.tv_sec; + cr->start_time.tv_usec = wpres->start.tv_usec; + cr->finish_time.tv_sec = wpres->stop.tv_sec; + cr->finish_time.tv_usec = wpres->stop.tv_usec; + if (WIFEXITED(wpres->wait_status)) { + cr->return_code = WEXITSTATUS(wpres->wait_status); + } else { + cr->return_code = STATE_UNKNOWN; + } - if (wpres->outstd && *wpres->outstd) { - cr->output = nm_strdup(wpres->outstd); - } else if (wpres->outerr && *wpres->outerr) { - nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr); - } else { - cr->output = NULL; - } + if (wpres->outstd && *wpres->outstd) { + cr->output = nm_strdup(wpres->outstd); + } else if (wpres->outerr && *wpres->outerr) { + nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr); + } else { + cr->output = NULL; + } - cr->early_timeout = wpres->early_timeout; - cr->exited_ok = wpres->exited_ok; - cr->engine = NULL; - cr->source = wpres->source; - process_check_result(cr); + cr->early_timeout = wpres->early_timeout; + cr->exited_ok = wpres->exited_ok; + cr->engine = NULL; + cr->source = wpres->source; + process_check_result(cr); + } } free_check_result(cr); nm_free(cr); diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 588148b89..53be1edd6 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -163,6 +163,11 @@ static void run_job_callback(struct wproc_job *job, struct wproc_result *wpres, { if (!job || !job->callback) return; + + if (!wpres) { + nm_log(NSLOG_RUNTIME_ERROR, "---!!!--- wpres is null or so TODO REMOVE THIS"); + return; + } (*job->callback)(wpres, job->data, val); job->callback = NULL; @@ -446,7 +451,7 @@ static int handle_worker_result(int sd, int events, void *arg) nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker"); /* Respawn a worker */ - if ((ret = spawn_core_worker()) < 0) { + if ((ret = spawn_core_worker()) < 0) { nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); } else { nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); From b829a682e10415913b4da87ffded09c26b057d87 Mon Sep 17 00:00:00 2001 From: nook24 Date: Fri, 1 Dec 2023 21:15:02 +0100 Subject: [PATCH 7/7] Remove debugging output Signed-off-by: nook24 --- src/naemon/workers.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 53be1edd6..f15b98bef 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -165,7 +165,6 @@ static void run_job_callback(struct wproc_job *job, struct wproc_result *wpres, return; if (!wpres) { - nm_log(NSLOG_RUNTIME_ERROR, "---!!!--- wpres is null or so TODO REMOVE THIS"); return; }