From 58a6f2ce06bc298b203b7108c93afc32c85538d2 Mon Sep 17 00:00:00 2001 From: Sven Nierlein Date: Mon, 18 Mar 2024 10:37:59 +0100 Subject: [PATCH] add new options to keep services running as long as they are ok. The issue with the options: - host_down_disable_service_checks - service_skip_check_dependency_status - service_skip_check_host_down_status - host_skip_check_dependency_status is that reports break because hosts/services suddenly stop executing and keep their OK state, which makes those options pretty unusable. So in order to keep reporting correct, you need to keep services running, even if the host is down. With these new options, hosts/services keep on running as long as they are up. As soon as the service is down, it stops running until the host comes back up. That way naemon has to do fewer checks, especially fewer checks which run into timeouts and such, but reporting is still correct. The option service_skip_check_dependency_status=-2 will also be used for service parents. Recommended settings are: host_down_disable_service_checks=1 ; disable service checks if host is down service_skip_check_host_down_status=-2 ; but keep running as long as they are ok service_skip_check_dependency_status=-2 ; same, but for dependency checks. 
host_skip_check_dependency_status=-2 ; and for host checks --- sample-config/naemon.cfg.in | 3 ++- src/naemon/checks_service.c | 18 ++++++++++++++++-- src/naemon/configuration.c | 2 +- src/naemon/defaults.h | 1 + 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/sample-config/naemon.cfg.in b/sample-config/naemon.cfg.in index 091e3773..96d4edd3 100644 --- a/sample-config/naemon.cfg.in +++ b/sample-config/naemon.cfg.in @@ -1042,7 +1042,6 @@ allow_empty_hostgroup_assignment=0 # # While desirable in some environments, enabling this value can distort report # values as the expected quantity of checks will not have been performed - #host_down_disable_service_checks=0 # SET SERVICE/HOST STATUS WHEN SERVICE CHECK SKIPPED @@ -1053,6 +1052,7 @@ allow_empty_hostgroup_assignment=0 # is set to 1. # Valid values for the service* options are: # -1 Do not change the service status (default) +# -2 Keep service running as long as it is ok/warning. (also used for service parents) # 0 Set the service status to STATE_OK # 1 Set the service status to STATE_WARNING # 2 Set the service status to STATE_CRITICAL @@ -1064,6 +1064,7 @@ allow_empty_hostgroup_assignment=0 # status of a host when its check is skipped due to a failed dependency check. # Valid values for the host_dependency_skip_check_status are: # -1 Do not change the service status (default) +# -2 Keep host running as long as it is up. 
# 0 Set the host status to STATE_UP # 1 Set the host status to STATE_DOWN # 2 Set the host status to STATE_UNREACHABLE diff --git a/src/naemon/checks_service.c b/src/naemon/checks_service.c index 0222a373..75a69925 100644 --- a/src/naemon/checks_service.c +++ b/src/naemon/checks_service.c @@ -218,9 +218,23 @@ static void handle_service_check_event(struct nm_event_execution_properties *evp if ((temp_host = temp_service->host_ptr) == NULL) { log_debug_info(DEBUGL_CHECKS, 2, "Host pointer NULL in handle_service_check_event().\n"); return; - } else { - if (temp_host->current_state != STATE_UP) { + } + if (temp_host->current_state != STATE_UP) { + int keep_running = TRUE; + switch (service_skip_check_host_down_status) { + /* only keep running if service is up or host_down_disable_service_checks is disabled */ + case SKIP_CHECK_KEEP_RUNNING_WHEN_UP: + if (temp_service->current_state > STATE_WARNING) { + log_debug_info(DEBUGL_CHECKS, 2, "Host and service state not UP, so service check will not be performed - will be rescheduled as normal.\n"); + keep_running = FALSE; + } + break; + default: log_debug_info(DEBUGL_CHECKS, 2, "Host state not UP, so service check will not be performed - will be rescheduled as normal.\n"); + keep_running = FALSE; + break; + } + if(!keep_running) { if (service_skip_check_host_down_status >= 0) { temp_service->current_state = service_skip_check_host_down_status; if (strstr(temp_service->plugin_output, "(host is down)") == NULL) { diff --git a/src/naemon/configuration.c b/src/naemon/configuration.c index 959ad2d4..2f6831a3 100644 --- a/src/naemon/configuration.c +++ b/src/naemon/configuration.c @@ -1076,7 +1076,7 @@ read_config_file(const char *main_config_file, nagios_macros *mac) } } else if (!strcmp(variable, "service_skip_check_host_down_status")) { service_skip_check_host_down_status = atoi(value); - if (service_skip_check_host_down_status < -1 || service_skip_check_host_down_status > 3) { + if (service_skip_check_host_down_status < -2 || 
service_skip_check_host_down_status > 3) { nm_asprintf(&error_message, "Illegal value for service_skip_check_host_down_status"); error = TRUE; break; diff --git a/src/naemon/defaults.h b/src/naemon/defaults.h index c1dd48a1..19c7a5b8 100644 --- a/src/naemon/defaults.h +++ b/src/naemon/defaults.h @@ -89,6 +89,7 @@ #define DEFAULT_ALLOW_CIRCULAR_DEPENDENCIES 0 /* Allow circular dependencies */ #define DEFAULT_HOST_DOWN_DISABLE_SERVICE_CHECKS 0 /* run service checks if the host is down */ #define DEFAULT_SKIP_CHECK_STATUS -1 /* do not change status by default */ +#define SKIP_CHECK_KEEP_RUNNING_WHEN_UP -2 /* run service checks as long as the host and service is up (ok/warning) */ #define DEFAULT_HOST_PERFDATA_FILE_TEMPLATE "[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$" #define DEFAULT_SERVICE_PERFDATA_FILE_TEMPLATE "[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"