Skip to content

Commit

Permalink
Fixed: Naemon stops executing checks and doesn't respawn Core Worker
Browse files Browse the repository at this point in the history
processes (naemon#418)
  • Loading branch information
ccztux committed Feb 27, 2023
1 parent 2916d62 commit 347c547
Showing 1 changed file with 51 additions and 23 deletions.
74 changes: 51 additions & 23 deletions src/naemon/workers.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL;
/*
 * Worker bookkeeping counters, all starting at zero.
 * wproc_num_workers_online is decremented in handle_worker_result() when
 * the socket to a worker is found to be broken.
 * NOTE(review): the desired/spawned counters are presumably maintained by
 * the worker spawn/registration code - confirm against the rest of the file.
 */
unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0;
unsigned int wproc_num_workers_spawned = 0;

/*
 * Forward declarations: handle_worker_result() calls both of these when a
 * worker dies, but their definitions appear further down in this file.
 */
static int get_desired_workers(int desired_workers);
static int spawn_core_worker(void);

/*
 * Convert a pointer to a timeval-like struct (anything with tv_sec and
 * tv_usec members) into seconds as a floating point value.
 * The conversion is done in double precision: casting tv_sec to float
 * would round large second counts, since a float only carries a 24-bit
 * mantissa. The expression already had type double because of the
 * 1000000.0 divisor, so the type seen by callers is unchanged.
 * Note: the argument is evaluated twice, so it must be side-effect free.
 */
#define tv2float(tv) ((double)((tv)->tv_sec) + ((double)(tv)->tv_usec) / 1000000.0)

static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf)
Expand Down Expand Up @@ -414,6 +417,7 @@ static int handle_worker_result(int sd, int events, void *arg)
char *buf, *error_reason = NULL;
size_t size;
int ret;
unsigned int desired_workers;
struct wproc_worker *wp = (struct wproc_worker *)arg;

ret = nm_bufferqueue_read(wp->bq, wp->sd);
Expand All @@ -428,17 +432,32 @@ static int handle_worker_result(int sd, int events, void *arg)
nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name);
wproc_num_workers_online--;
iobroker_unregister(nagios_iobs, sd);
if (workers.len <= 0) {
/* there aren't global workers left, we can't run any more checks
* we should try respawning a few of the standard ones
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!");
}

/* remove worker from worker list - this ensures that we don't reassign
* its jobs back to itself*/
remove_worker(wp);

desired_workers = get_desired_workers(num_check_workers);

if (workers.len < desired_workers) {
/* we have fewer workers than desired (the worker whose socket
 * broke was just removed from the list), so try to respawn one
 * core worker to get back towards the desired count */
nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker");

/* Respawn a worker */
if ((ret = spawn_core_worker()) < 0) {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker");
} else {
nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret);
}
} else if (workers.len == 0) {
/* there aren't global workers left, we can't run any more checks
* we should try respawning a few of the standard ones
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!");
}

/* reassign this dead worker's jobs */
g_hash_table_iter_init(&iter, wp->jobs);
while (g_hash_table_iter_next(&iter, NULL, &job_)) {
Expand Down Expand Up @@ -664,24 +683,8 @@ static int spawn_core_worker(void)
}


int init_workers(int desired_workers)
static int get_desired_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

if (desired_workers <= 0) {
int cpus = online_cpus();

Expand All @@ -708,6 +711,31 @@ int init_workers(int desired_workers)
if (desired_workers < (int)workers.len)
return -1;

return desired_workers;
}


int init_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

/* Get the number of workers we need */
desired_workers = get_desired_workers(desired_workers);

for (i = 0; i < desired_workers; i++)
spawn_core_worker();

Expand Down

0 comments on commit 347c547

Please sign in to comment.