Skip to content

Commit

Permalink
add problem timestamps and duration
Browse files Browse the repository at this point in the history
This PR makes hosts and services save the start and end timestamps of the current
problem. Those values can then be used as macros, e.g. in notification scripts.
For this, there are several new macros available:

- $HOSTPROBLEMSTART$ start timestamp of problem
- $HOSTPROBLEMEND$ end timestamp of problem (or zero if problem still persists)
- $HOSTPROBLEMDURATIONSEC$ duration of problem
- $HOSTPROBLEMDURATION$ duration as human readable text

The same macros exist for services:
- $SERVICEPROBLEMSTART$
- $SERVICEPROBLEMEND$
- $SERVICEPROBLEMDURATIONSEC$
- $SERVICEPROBLEMDURATION$

While a problem is ongoing, the values refer to that problem and the end
timestamp is zero. Once the problem is resolved, the values can still be used
and won't change until a new problem starts.

This makes it possible to use the problem duration in recovery notifications,
which would otherwise not be possible.
  • Loading branch information
sni committed Jan 26, 2024
1 parent d8e1f42 commit c749cab
Show file tree
Hide file tree
Showing 11 changed files with 118 additions and 35 deletions.
14 changes: 13 additions & 1 deletion lib/nsutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,19 @@ const char *mkstr(const char *fmt, ...)
return ret;
}


/**
 * Format a duration given in seconds as a human readable string.
 *
 * The result has the form "<days>d <hours>h <minutes>m <seconds>s",
 * e.g. 90061 seconds yield "1d 1h 1m 1s".
 *
 * All arithmetic stays in unsigned long: splitting the value through int
 * intermediates (days * 86400) overflows a signed int, which is undefined
 * behaviour, once the duration reaches roughly 68 years.
 *
 * @param[in] duration The duration in seconds
 * @return The string built by mkstr(); the caller must not free() it
 */
const char *duration_string(unsigned long duration)
{
	/* 86400 is a multiple of 3600 and of 60, so each remainder can be
	 * taken directly from the full duration. */
	const unsigned long days = duration / 86400UL;
	const unsigned long hours = (duration % 86400UL) / 3600UL;
	const unsigned long minutes = (duration % 3600UL) / 60UL;
	const unsigned long seconds = duration % 60UL;

	return mkstr("%lud %luh %lum %lus", days, hours, minutes, seconds);
}

/* close and reopen stdin, stdout and stderr to /dev/null */
void close_standard_fds(void)
Expand Down
8 changes: 8 additions & 0 deletions lib/nsutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ extern int online_cpus(void);
extern const char *mkstr(const char *fmt, ...)
__attribute__((__format__(__printf__, 1, 2)));

/**
 * Format a duration in seconds into a human readable string of the
 * form "<days>d <hours>h <minutes>m <seconds>s".
 * @note The returned string must *not* be free()'d!
 * @param[in] duration The duration in seconds
 * @return A pointer to the formatted string on success. Undefined on errors
 */
extern const char *duration_string(unsigned long);

/**
* Calculate the millisecond delta between two timeval structs
* @param[in] start The start time
Expand Down
3 changes: 3 additions & 0 deletions src/naemon/checks_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -1010,12 +1010,15 @@ static int handle_host_state(host *hst, int *alert_recorded)
/* don't reset last problem id, or it will be zero the next time a problem is encountered */
hst->current_problem_id = next_problem_id;
next_problem_id++;
hst->problem_start = current_time;
hst->problem_end = 0L;
}

/* clear the problem id when transitioning from a problem state to an UP state */
if (hst->current_state == STATE_UP) {
hst->last_problem_id = hst->current_problem_id;
hst->current_problem_id = 0L;
hst->problem_end = current_time;
}

/* write the host state change to the main log file */
Expand Down
4 changes: 4 additions & 0 deletions src/naemon/checks_service.c
Original file line number Diff line number Diff line change
Expand Up @@ -688,12 +688,16 @@ int handle_async_service_check_result(service *temp_service, check_result *queue
/* don't reset last problem id, or it will be zero the next time a problem is encountered */
temp_service->current_problem_id = next_problem_id;
next_problem_id++;
temp_service->problem_start = current_time;
temp_service->problem_end = 0L;
}

/* clear the problem id when transitioning from a problem state to an OK state */
if (temp_service->current_state == STATE_OK) {
temp_service->last_problem_id = temp_service->current_problem_id;
temp_service->current_problem_id = 0L;
temp_service->current_problem_id = 0L;
temp_service->problem_end = current_time;
}
}

Expand Down
90 changes: 59 additions & 31 deletions src/naemon/macros.c
Original file line number Diff line number Diff line change
Expand Up @@ -627,10 +627,6 @@ static int grab_standard_host_macro_r(nagios_macros *mac, int macro_type, host *
objectlist *temp_objectlist = NULL;
time_t current_time = 0L;
unsigned long duration = 0L;
int days = 0;
int hours = 0;
int minutes = 0;
int seconds = 0;
char *buf1 = NULL;
char *buf2 = NULL;
int total_host_services = 0;
Expand Down Expand Up @@ -708,19 +704,10 @@ static int grab_standard_host_macro_r(nagios_macros *mac, int macro_type, host *
case MACRO_HOSTDURATION:
time(&current_time);
duration = (unsigned long)(current_time - temp_host->last_state_change);

if (macro_type == MACRO_HOSTDURATIONSEC)
*output = (char *)mkstr("%lu", duration);
else {

days = duration / 86400;
duration -= (days * 86400);
hours = duration / 3600;
duration -= (hours * 3600);
minutes = duration / 60;
duration -= (minutes * 60);
seconds = duration;
*output = (char *)mkstr("%dd %dh %dm %ds", days, hours, minutes, seconds);
*output = (char *)mkstr("%s", duration_string(duration));
}
break;
case MACRO_HOSTEXECUTIONTIME:
Expand Down Expand Up @@ -762,6 +749,26 @@ static int grab_standard_host_macro_r(nagios_macros *mac, int macro_type, host *
case MACRO_LASTHOSTPROBLEMID:
*output = (char *)mkstr("%lu", temp_host->last_problem_id);
break;
case MACRO_HOSTPROBLEMSTART:
*output = (char *)mkstr("%lu", (unsigned long)temp_host->problem_start);
break;
case MACRO_HOSTPROBLEMEND:
*output = (char *)mkstr("%lu", (unsigned long)temp_host->problem_end);
break;
case MACRO_HOSTPROBLEMDURATIONSEC:
case MACRO_HOSTPROBLEMDURATION:
if(temp_host->problem_end > 0) {
duration = (unsigned long)(temp_host->problem_end - temp_host->problem_start);
} else if(temp_host->problem_start > 0) {
time(&current_time);
duration = (unsigned long)(current_time - temp_host->problem_start);
}
if (macro_type == MACRO_HOSTPROBLEMDURATIONSEC)
*output = (char *)mkstr("%lu", duration);
else {
*output = (char *)mkstr("%s", duration_string(duration));
}
break;
case MACRO_HOSTACTIONURL:
if (temp_host->action_url)
*output = temp_host->action_url;
Expand Down Expand Up @@ -950,10 +957,6 @@ static int grab_standard_service_macro_r(nagios_macros *mac, int macro_type, ser
objectlist *temp_objectlist = NULL;
time_t current_time = 0L;
unsigned long duration = 0L;
int days = 0;
int hours = 0;
int minutes = 0;
int seconds = 0;
char *buf1 = NULL;
char *buf2 = NULL;

Expand Down Expand Up @@ -1044,24 +1047,12 @@ static int grab_standard_service_macro_r(nagios_macros *mac, int macro_type, ser
break;
case MACRO_SERVICEDURATIONSEC:
case MACRO_SERVICEDURATION:

time(&current_time);
duration = (unsigned long)(current_time - temp_service->last_state_change);

/* get the state duration in seconds */
if (macro_type == MACRO_SERVICEDURATIONSEC)
*output = (char *)mkstr("%lu", duration);

/* get the state duration */
else {
days = duration / 86400;
duration -= (days * 86400);
hours = duration / 3600;
duration -= (hours * 3600);
minutes = duration / 60;
duration -= (minutes * 60);
seconds = duration;
*output = (char *)mkstr("%dd %dh %dm %ds", days, hours, minutes, seconds);
*output = (char *)mkstr("%s", duration_string(duration));
}
break;
case MACRO_SERVICENOTIFICATIONNUMBER:
Expand All @@ -1082,6 +1073,26 @@ static int grab_standard_service_macro_r(nagios_macros *mac, int macro_type, ser
case MACRO_LASTSERVICEPROBLEMID:
*output = (char *)mkstr("%lu", temp_service->last_problem_id);
break;
case MACRO_SERVICEPROBLEMSTART:
*output = (char *)mkstr("%lu", (unsigned long)temp_service->problem_start);
break;
case MACRO_SERVICEPROBLEMEND:
*output = (char *)mkstr("%lu", (unsigned long)temp_service->problem_end);
break;
case MACRO_SERVICEPROBLEMDURATIONSEC:
case MACRO_SERVICEPROBLEMDURATION:
if(temp_service->problem_end > 0) {
duration = (unsigned long)(temp_service->problem_end - temp_service->problem_start);
} else if(temp_service->problem_start > 0) {
time(&current_time);
duration = (unsigned long)(current_time - temp_service->problem_start);
}
if (macro_type == MACRO_SERVICEPROBLEMDURATIONSEC)
*output = (char *)mkstr("%lu", duration);
else {
*output = (char *)mkstr("%s", duration_string(duration));
}
break;
case MACRO_SERVICEACTIONURL:
if (temp_service->action_url)
*output = temp_service->action_url;
Expand Down Expand Up @@ -1569,6 +1580,11 @@ static int grab_macrox_value_r(nagios_macros *mac, int macro_type, char *arg1, c
case MACRO_LASTHOSTPROBLEMID:
case MACRO_LASTHOSTSTATE:
case MACRO_LASTHOSTSTATEID:
case MACRO_HOSTPROBLEMSTART:
case MACRO_HOSTPROBLEMEND:
case MACRO_HOSTPROBLEMDURATIONSEC:
case MACRO_HOSTPROBLEMDURATION:


/* a standard host macro */
if (arg2 == NULL) {
Expand Down Expand Up @@ -1688,6 +1704,10 @@ static int grab_macrox_value_r(nagios_macros *mac, int macro_type, char *arg1, c
case MACRO_LASTSERVICEPROBLEMID:
case MACRO_LASTSERVICESTATE:
case MACRO_LASTSERVICESTATEID:
case MACRO_SERVICEPROBLEMSTART:
case MACRO_SERVICEPROBLEMEND:
case MACRO_SERVICEPROBLEMDURATIONSEC:
case MACRO_SERVICEPROBLEMDURATION:

/* use saved service pointer */
if (arg1 == NULL && arg2 == NULL) {
Expand Down Expand Up @@ -2691,6 +2711,14 @@ int init_macrox_names(void)
add_macrox_name(HOSTVALUE);
add_macrox_name(SERVICEVALUE);
add_macrox_name(PROBLEMVALUE);
add_macrox_name(HOSTPROBLEMSTART);
add_macrox_name(HOSTPROBLEMEND);
add_macrox_name(HOSTPROBLEMDURATIONSEC);
add_macrox_name(HOSTPROBLEMDURATION);
add_macrox_name(SERVICEPROBLEMSTART);
add_macrox_name(SERVICEPROBLEMEND);
add_macrox_name(SERVICEPROBLEMDURATIONSEC);
add_macrox_name(SERVICEPROBLEMDURATION);

return OK;
}
Expand Down
12 changes: 10 additions & 2 deletions src/naemon/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
/****************** MACRO DEFINITIONS *****************/
#define MACRO_ENV_VAR_PREFIX "NAGIOS_"
#define MAX_USER_MACROS 256 /* max $USERx$ macros */
#define MACRO_X_COUNT 156 /* size of macro_x[] array */
#define MACRO_X_COUNT 164 /* size of macro_x[] array */

NAGIOS_BEGIN_DECL

Expand Down Expand Up @@ -201,7 +201,15 @@ typedef struct nagios_macros nagios_macros;
#define MACRO_HOSTVALUE 153
#define MACRO_SERVICEVALUE 154
#define MACRO_PROBLEMVALUE 155

#define MACRO_HOSTPROBLEMSTART 156
#define MACRO_HOSTPROBLEMEND 157
#define MACRO_HOSTPROBLEMDURATIONSEC 158
#define MACRO_HOSTPROBLEMDURATION 159
#define MACRO_SERVICEPROBLEMSTART 160
#define MACRO_SERVICEPROBLEMEND 161
#define MACRO_SERVICEPROBLEMDURATIONSEC 162
#define MACRO_SERVICEPROBLEMDURATION 163
/* NOTE: update MACRO_X_COUNT above to highest macro + 1 */

/************* MACRO CLEANING OPTIONS *****************/
#define STRIP_ILLEGAL_MACRO_CHARS 1
Expand Down
2 changes: 1 addition & 1 deletion src/naemon/nebmodules.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ NAGIOS_BEGIN_DECL

/***** MODULE VERSION INFORMATION *****/
#define NEB_API_VERSION(x) int __neb_api_version = x;
#define CURRENT_NEB_API_VERSION 6
#define CURRENT_NEB_API_VERSION 7


/***** MODULE INFORMATION *****/
Expand Down
2 changes: 2 additions & 0 deletions src/naemon/objects_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ struct host {
unsigned long last_event_id;
unsigned long current_problem_id;
unsigned long last_problem_id;
time_t problem_start;
time_t problem_end;
double latency;
double execution_time;
int is_executing;
Expand Down
2 changes: 2 additions & 0 deletions src/naemon/objects_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ struct service {
unsigned long last_event_id;
unsigned long current_problem_id;
unsigned long last_problem_id;
time_t problem_start;
time_t problem_end;
time_t last_notification;
time_t next_notification;
int no_more_notifications;
Expand Down
12 changes: 12 additions & 0 deletions src/naemon/xrddefault.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ int xrddefault_save_state_information(void)
fprintf(fp, "current_event_id=%lu\n", temp_host->current_event_id);
fprintf(fp, "current_problem_id=%lu\n", temp_host->current_problem_id);
fprintf(fp, "last_problem_id=%lu\n", temp_host->last_problem_id);
fprintf(fp, "problem_start=%lu\n", temp_host->problem_start);
fprintf(fp, "problem_end=%lu\n", temp_host->problem_end);
fprintf(fp, "plugin_output=%s\n", (temp_host->plugin_output == NULL) ? "" : temp_host->plugin_output);
fprintf(fp, "long_plugin_output=%s\n", (temp_host->long_plugin_output == NULL) ? "" : temp_host->long_plugin_output);
fprintf(fp, "performance_data=%s\n", (temp_host->perf_data == NULL) ? "" : temp_host->perf_data);
Expand Down Expand Up @@ -275,6 +277,8 @@ int xrddefault_save_state_information(void)
fprintf(fp, "current_event_id=%lu\n", temp_service->current_event_id);
fprintf(fp, "current_problem_id=%lu\n", temp_service->current_problem_id);
fprintf(fp, "last_problem_id=%lu\n", temp_service->last_problem_id);
fprintf(fp, "problem_start=%lu\n", temp_service->problem_start);
fprintf(fp, "problem_end=%lu\n", temp_service->problem_end);
fprintf(fp, "current_attempt=%d\n", temp_service->current_attempt);
fprintf(fp, "max_attempts=%d\n", temp_service->max_attempts);
fprintf(fp, "normal_check_interval=%f\n", temp_service->check_interval);
Expand Down Expand Up @@ -1064,6 +1068,10 @@ int xrddefault_read_state_information(void)
temp_host->current_problem_id = strtoul(val, NULL, 10);
else if (!strcmp(var, "last_problem_id"))
temp_host->last_problem_id = strtoul(val, NULL, 10);
else if (!strcmp(var, "problem_start"))
temp_host->problem_start = strtoul(val, NULL, 10);
else if (!strcmp(var, "problem_end"))
temp_host->problem_end = strtoul(val, NULL, 10);
else if (!strcmp(var, "state_type"))
temp_host->state_type = atoi(val);
else if (!strcmp(var, "last_state_change"))
Expand Down Expand Up @@ -1310,6 +1318,10 @@ int xrddefault_read_state_information(void)
temp_service->current_problem_id = strtoul(val, NULL, 10);
else if (!strcmp(var, "last_problem_id"))
temp_service->last_problem_id = strtoul(val, NULL, 10);
else if (!strcmp(var, "problem_start"))
temp_service->problem_start = strtoul(val, NULL, 10);
else if (!strcmp(var, "problem_end"))
temp_service->problem_end = strtoul(val, NULL, 10);
else if (!strcmp(var, "state_type"))
temp_service->state_type = atoi(val);
else if (!strcmp(var, "last_state_change"))
Expand Down
4 changes: 4 additions & 0 deletions src/naemon/xsddefault.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ int xsddefault_save_status_data(void)
fprintf(fp, "\tcurrent_event_id=%lu\n", temp_host->current_event_id);
fprintf(fp, "\tcurrent_problem_id=%lu\n", temp_host->current_problem_id);
fprintf(fp, "\tlast_problem_id=%lu\n", temp_host->last_problem_id);
fprintf(fp, "\tproblem_start=%lu\n", temp_host->problem_start);
fprintf(fp, "\tproblem_end=%lu\n", temp_host->problem_end);
fprintf(fp, "\tplugin_output=%s\n", (temp_host->plugin_output == NULL) ? "" : temp_host->plugin_output);
fprintf(fp, "\tlong_plugin_output=%s\n", (temp_host->long_plugin_output == NULL) ? "" : temp_host->long_plugin_output);
fprintf(fp, "\tperformance_data=%s\n", (temp_host->perf_data == NULL) ? "" : temp_host->perf_data);
Expand Down Expand Up @@ -269,6 +271,8 @@ int xsddefault_save_status_data(void)
fprintf(fp, "\tcurrent_event_id=%lu\n", temp_service->current_event_id);
fprintf(fp, "\tcurrent_problem_id=%lu\n", temp_service->current_problem_id);
fprintf(fp, "\tlast_problem_id=%lu\n", temp_service->last_problem_id);
fprintf(fp, "\tproblem_start=%lu\n", temp_service->problem_start);
fprintf(fp, "\tproblem_end=%lu\n", temp_service->problem_end);
fprintf(fp, "\tcurrent_attempt=%d\n", temp_service->current_attempt);
fprintf(fp, "\tmax_attempts=%d\n", temp_service->max_attempts);
fprintf(fp, "\tstate_type=%d\n", temp_service->state_type);
Expand Down

0 comments on commit c749cab

Please sign in to comment.