Skip to content

Commit

Permalink
Merge pull request #421 from naemon/pr/419
Browse files Browse the repository at this point in the history
Fixed: Naemon stops executing checks and doesnt respawn Core Worker #419
Happy new Year!
  • Loading branch information
nook24 committed Jan 3, 2024
2 parents 83b25ec + b829a68 commit 50b573a
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 48 deletions.
52 changes: 27 additions & 25 deletions src/naemon/checks_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,33 +633,35 @@ static void handle_worker_host_check(wproc_result *wpres, void *arg, int flags)
if (currently_running_host_checks > 0)
currently_running_host_checks--;

hst = find_host(cr->host_name);
if (hst && wpres) {
hst->is_executing = FALSE;
memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage));
cr->start_time.tv_sec = wpres->start.tv_sec;
cr->start_time.tv_usec = wpres->start.tv_usec;
cr->finish_time.tv_sec = wpres->stop.tv_sec;
cr->finish_time.tv_usec = wpres->stop.tv_usec;
if (WIFEXITED(wpres->wait_status)) {
cr->return_code = WEXITSTATUS(wpres->wait_status);
} else {
cr->return_code = STATE_UNKNOWN;
}
if (wpres) {
hst = find_host(cr->host_name);
if (hst) {
hst->is_executing = FALSE;
memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage));
cr->start_time.tv_sec = wpres->start.tv_sec;
cr->start_time.tv_usec = wpres->start.tv_usec;
cr->finish_time.tv_sec = wpres->stop.tv_sec;
cr->finish_time.tv_usec = wpres->stop.tv_usec;
if (WIFEXITED(wpres->wait_status)) {
cr->return_code = WEXITSTATUS(wpres->wait_status);
} else {
cr->return_code = STATE_UNKNOWN;
}

if (wpres->outstd && *wpres->outstd) {
cr->output = nm_strdup(wpres->outstd);
} else if (wpres->outerr && *wpres->outerr) {
nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr);
} else {
cr->output = NULL;
}
if (wpres->outstd && *wpres->outstd) {
cr->output = nm_strdup(wpres->outstd);
} else if (wpres->outerr && *wpres->outerr) {
nm_asprintf(&cr->output, "(No output on stdout) stderr: %s", wpres->outerr);
} else {
cr->output = NULL;
}

cr->early_timeout = wpres->early_timeout;
cr->exited_ok = wpres->exited_ok;
cr->engine = NULL;
cr->source = wpres->source;
process_check_result(cr);
cr->early_timeout = wpres->early_timeout;
cr->exited_ok = wpres->exited_ok;
cr->engine = NULL;
cr->source = wpres->source;
process_check_result(cr);
}
}
free_check_result(cr);
nm_free(cr);
Expand Down
79 changes: 56 additions & 23 deletions src/naemon/workers.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ static struct wproc_list *to_remove = NULL;
unsigned int wproc_num_workers_online = 0, wproc_num_workers_desired = 0;
unsigned int wproc_num_workers_spawned = 0;

static int get_desired_workers(int desired_workers);
static int spawn_core_worker(void);

#define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0)

static void wproc_logdump_buffer(int debuglevel, int verbosity, const char *prefix, char *buf)
Expand Down Expand Up @@ -160,6 +163,10 @@ static void run_job_callback(struct wproc_job *job, struct wproc_result *wpres,
{
if (!job || !job->callback)
return;

if (!wpres) {
return;
}

(*job->callback)(wpres, job->data, val);
job->callback = NULL;
Expand Down Expand Up @@ -414,6 +421,7 @@ static int handle_worker_result(int sd, int events, void *arg)
char *buf, *error_reason = NULL;
size_t size;
int ret;
unsigned int desired_workers;
struct wproc_worker *wp = (struct wproc_worker *)arg;

ret = nm_bufferqueue_read(wp->bq, wp->sd);
Expand All @@ -428,17 +436,32 @@ static int handle_worker_result(int sd, int events, void *arg)
nm_log(NSLOG_INFO_MESSAGE, "wproc: Socket to worker %s broken, removing", wp->name);
wproc_num_workers_online--;
iobroker_unregister(nagios_iobs, sd);
if (workers.len <= 0) {

/* remove worker from worker list - this ensures that we don't reassign
* its jobs back to itself*/
remove_worker(wp);

desired_workers = get_desired_workers(num_check_workers);

if (workers.len < desired_workers) {
/* there aren't global workers left, we can't run any more checks
* we should try respawning a few of the standard ones
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: We have have less Core Workers than we should have, trying to respawn Core Worker");

/* Respawn a worker */
if ((ret = spawn_core_worker()) < 0) {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to respawn Core Worker");
} else {
nm_log(NSLOG_INFO_MESSAGE, "wproc: Respawning Core Worker %u was successful", ret);
}
} else if (workers.len == 0) {
/* there aren't global workers left, we can't run any more checks
* this should never happen, because the respawning will be done in the upper if condition
*/
nm_log(NSLOG_RUNTIME_ERROR, "wproc: All our workers are dead, we can't do anything!");
}

/* remove worker from worker list - this ensures that we don't reassign
* its jobs back to itself*/
remove_worker(wp);

/* reassign this dead worker's jobs */
g_hash_table_iter_init(&iter, wp->jobs);
while (g_hash_table_iter_next(&iter, NULL, &job_)) {
Expand All @@ -449,7 +472,7 @@ static int handle_worker_result(int sd, int events, void *arg)
);
}

wproc_destroy(wp, 0);
wproc_destroy(wp, WPROC_FORCE);
return 0;
}
while ((buf = worker_ioc2msg(wp->bq, &size, 0))) {
Expand Down Expand Up @@ -664,24 +687,8 @@ static int spawn_core_worker(void)
}


int init_workers(int desired_workers)
static int get_desired_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

if (desired_workers <= 0) {
int cpus = online_cpus();

Expand All @@ -699,8 +706,34 @@ int init_workers(int desired_workers)
}
}
}

wproc_num_workers_desired = desired_workers;

return desired_workers;
}


int init_workers(int desired_workers)
{
int i;

/*
* we register our query handler before launching workers,
* so other workers can join us whenever they're ready
*/
specialized_workers = g_hash_table_new_full(g_str_hash, g_str_equal,
free, NULL
);
if (!qh_register_handler("wproc", "Worker process management and info", 0, wproc_query_handler)) {
log_debug_info(DEBUGL_IPC, DEBUGV_BASIC, "wproc: Successfully registered manager as @wproc with query handler\n");
} else {
nm_log(NSLOG_RUNTIME_ERROR, "wproc: Failed to register manager with query handler\n");
return -1;
}

/* Get the number of workers we need */
desired_workers = get_desired_workers(desired_workers);

if (workers_alive() == desired_workers)
return 0;

Expand Down

0 comments on commit 50b573a

Please sign in to comment.