diff --git a/src/naemon/workers.c b/src/naemon/workers.c index 35ac0d67..368e186b 100644 --- a/src/naemon/workers.c +++ b/src/naemon/workers.c @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL; unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0; unsigned int wproc_num_workers_spawned = 0; +static int get_desired_workers(int desired_workers); +static int spawn_core_worker(void); + #define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0) static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf) @@ -414,6 +417,7 @@ static int handle_worker_result(int sd, int events, void *arg) char *buf, *error_reason = NULL; size_t size; int ret; + unsigned int desired_workers; struct wproc_worker *wp = (struct wproc_worker *)arg; ret = nm_bufferqueue_read(wp->bq, wp->sd); @@ -428,17 +432,32 @@ static int handle_worker_result(int sd, int events, void *arg) nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); - if (workers.len <= 0) { - /* there aren't global workers left, we can't run any more checks - * we should try respawning a few of the standard ones - */ - nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); - } /* remove worker from worker list - this ensures that we don't reassign * its jobs back to itself*/ remove_worker(wp); + desired_workers = get_desired_workers(num_check_workers); + + if (workers.len < desired_workers) { + /* there aren't global workers left, we can't run any more checks + * we should try respawning a few of the standard ones + */ + nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker"); + + /* Respawn a worker */ + if ((ret = spawn_core_worker()) < 0) { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker"); + } else { + nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret); + } + } else if (workers.len == 0) { + /* there aren't global workers left, we can't run any more checks + * we should try respawning a few of the standard ones + */ + nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!"); + } + /* reassign this dead worker's jobs */ g_hash_table_iter_init(&iter, wp->jobs); while (g_hash_table_iter_next(&iter, NULL, &job_)) { @@ -664,24 +683,8 @@ static int spawn_core_worker(void) } -int init_workers(int desired_workers) +static int get_desired_workers(int desired_workers) { - int i; - - /* - * we register our query handler before launching workers, - * so other workers can join us whenever they're ready - */ - specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, - free, NULL - ); - if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { - log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); - } else { - nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); - return -1; - } - if (desired_workers <= 0) { int cpus = online_cpus(); @@ -708,6 +711,31 @@ int init_workers(int desired_workers) if (desired_workers < (int)workers.len) return -1; + return desired_workers; +} + + +int init_workers(int desired_workers) +{ + int i; + + /* + * we register our query handler before launching workers, + * so other workers can join us whenever they're ready + */ + specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal, + free, NULL + ); + if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) { + log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n"); + } else { + nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n"); + return -1; + } + + /* Get the number of workers we need */ + desired_workers = get_desired_workers(desired_workers); + for (i = 0; i < desired_workers; i++) spawn_core_worker();