Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed: Naemon stops executing checks and doesnt respawn Core Worker #419

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 50 additions & 22 deletions src/naemon/workers.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL;
unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0;
unsigned int wproc_num_workers_spawned = 0;

static int get_desired_workers(int desired_workers);
static int spawn_core_worker(void);

#define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0)

static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf)
Expand Down Expand Up @@ -414,6 +417,7 @@ static int handle_worker_result(int sd, int events, void *arg)
char *buf, *error_reason = NULL;
size_t size;
int ret;
unsigned int desired_workers;
struct wproc_worker *wp = (struct wproc_worker *)arg;

ret = nm_bufferqueue_read(wp->bq, wp->sd);
Expand All @@ -428,17 +432,32 @@ static int handle_worker_result(int sd, int events, void *arg)
nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name);
wproc_num_workers_online--;
iobroker_unregister(nagios_iobs, sd);
if (workers.len <= 0) {

/* remove worker from worker list - this ensures that we don't reassign
* its jobs back to itself*/
remove_worker(wp);

desired_workers = get_desired_workers(num_check_workers);

if (workers.len < desired_workers) {
/* there aren't global workers left, we can't run any more checks
* we should try respawning a few of the standard ones
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker");

/* Respawn a worker */
if ((ret = spawn_core_worker()) < 0) {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker");
} else {
nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret);
}
} else if (workers.len == 0) {
/* there aren't global workers left, we can't run any more checks
* this should never happen, because the respawning will be done in the upper if condition
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!");
}

/* remove worker from worker list - this ensures that we don't reassign
* its jobs back to itself*/
remove_worker(wp);

/* reassign this dead worker's jobs */
g_hash_table_iter_init(&iter, wp->jobs);
while (g_hash_table_iter_next(&iter, NULL, &job_)) {
Expand Down Expand Up @@ -664,24 +683,8 @@ static int spawn_core_worker(void)
}


int init_workers(int desired_workers)
static int get_desired_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

if (desired_workers <= 0) {
int cpus = online_cpus();

Expand All @@ -708,6 +711,31 @@ int init_workers(int desired_workers)
if (desired_workers < (int)workers.len)
return -1;

return desired_workers;
}


int init_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

/* Get the number of workers we need */
desired_workers = get_desired_workers(desired_workers);

for (i = 0; i < desired_workers; i++)
spawn_core_worker();

Expand Down