diff --git a/src/naemon/checks_host.c b/src/naemon/checks_host.c index 1c893b46..f9246254 100644 --- a/src/naemon/checks_host.c +++ b/src/naemon/checks_host.c @@ -633,33 +633,35 @@ static void handle_worker_host_check(wproc_result *wpres, void *arg, int flags) if (currently_running_host_checks > 0) currently_running_host_checks--; - hst = find_host(cr->host_name); - if (hst && wpres) { - hst->is_executing = FALSE; - memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage)); - cr->start_time.tv_sec = wpres->start.tv_sec; - cr->start_time.tv_usec = wpres->start.tv_usec; - cr->finish_time.tv_sec = wpres->stop.tv_sec; - cr->finish_time.tv_usec = wpres->stop.tv_usec; - if (WIFEXITED(wpres->wait_status)) { - cr->return_code = WEXITSTATUS(wpres->wait_status); - } else { - cr->return_code = STATE_UNKNOWN; - } + if (wpres) { + hst = find_host(cr->host_name); + if (hst) { + hst->is_executing = FALSE; + memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage)); + cr->start_time.tv_sec = wpres->start.tv_sec; + cr->start_time.tv_usec = wpres->start.tv_usec; + cr->finish_time.tv_sec = wpres->stop.tv_sec; + cr->finish_time.tv_usec = wpres->stop.tv_usec; + if (WIFEXITED(wpres->wait_status)) { + cr->return_code = WEXITSTATUS(wpres->wait_status); + } else { + cr->return_code = STATE_UNKNOWN; + } - if (wpres->outstd && *wpres->outstd) { - cr->output = nm_strdup(wpres->outstd); - } else if (wpres->outerr && *wpres->outerr) { - nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr); - } else { - cr->output = NULL; - } + if (wpres->outstd && *wpres->outstd) { + cr->output = nm_strdup(wpres->outstd); + } else if (wpres->outerr && *wpres->outerr) { + nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr); + } else { + cr->output = NULL; + } - cr->early_timeout = wpres->early_timeout; - cr->exited_ok = wpres->exited_ok; - cr->engine = NULL; - cr->source = wpres->source; - process_check_result(cr); + cr->early_timeout = wpres->early_timeout; + cr->exited_ok = wpres->exited_ok; + cr->engine = NULL; + cr->source = wpres->source; + process_check_result(cr); + } } free_check_result(cr); nm_free(cr); diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 35ac0d67..f15b98be 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL; unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0; unsigned int wproc_num_workers_spawned = 0; +static int get_desired_workers(int desired_workers); +static int spawn_core_worker(void); + #define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0) static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf) @@ -160,6 +163,10 @@ static void run_job_callback(struct wproc_job *job, struct wproc_result *wpres, { if (!job || !job->callback) return; + + if (!wpres) { + return; + } (*job->callback)(wpres, job->data, val); job->callback = NULL; @@ -414,6 +421,7 @@ static int handle_worker_result(int sd, int events, void *arg) char *buf, *error_reason = NULL; size_t size; int ret; + unsigned int desired_workers; struct wproc_worker *wp = (struct wproc_worker *)arg; ret = nm_bufferqueue_read(wp->bq, wp->sd); @@ -428,17 +436,32 @@ static int handle_worker_result(int sd, int events, void *arg) nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); - if (workers.len <= 0) { + + /* remove worker from worker list - this ensures that we don't reassign + * its jobs back to itself*/ + remove_worker(wp); + + desired_workers = get_desired_workers(num_check_workers); + + if (workers.len < desired_workers) { /* there aren't global workers left, we can't run any more checks * we should try respawning a few of the standard ones */ + nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker"); + + /* Respawn a worker */ + if ((ret = spawn_core_worker()) < 0) { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); + } else { + nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); + } + } else if (workers.len == 0) { + /* there aren't global workers left, we can't run any more checks + * this should never happen, because the respawning will be done in the upper if condition + */ nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); } - /* remove worker from worker list - this ensures that we don't reassign - * its jobs back to itself*/ - remove_worker(wp); - /* reassign this dead worker's jobs */ g_hash_table_iter_init(&iter, wp->jobs); while (g_hash_table_iter_next(&iter, NULL, &job_)) { @@ -449,7 +472,7 @@ static int handle_worker_result(int sd, int events, void *arg) ); } - wproc_destroy(wp, 0); + wproc_destroy(wp, WPROC_FORCE); return 0; } while ((buf = worker_ioc2msg(wp->bq, &size, 0))) { @@ -664,24 +687,8 @@ static int spawn_core_worker(void) } -int init_workers(int desired_workers) +static int get_desired_workers(int desired_workers) { - int i; - - /* - * we register our query handler before launching workers, - * so other workers can join us whenever they're ready - */ - specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, - free, NULL - ); - if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { - log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); - } else { - nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); - return -1; - } - if (desired_workers <= 0) { int cpus = online_cpus(); @@ -699,8 +706,34 @@ int init_workers(int desired_workers) } } } + wproc_num_workers_desired = desired_workers; + return desired_workers; +} + + +int init_workers(int desired_workers) +{ + int i; + + /* + * we register our query handler before launching workers, + * so other workers can join us whenever they're ready + */ + specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, + free, NULL + ); + if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { + log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); + } else { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); + return -1; + } + + /* Get the number of workers we need */ + desired_workers = get_desired_workers(desired_workers); + if (workers_alive() == desired_workers) return 0;