From 5dcabf731956acec90203531199fbb1a87e7bb16 Mon Sep 17 00:00:00 2001 From: Solomon Jacobs Date: Wed, 3 Apr 2024 15:17:35 +0200 Subject: [PATCH] 16417 FIX Add Configuration Option 'checkmkAgentTimeout' CMK-16676 Closes: #26 Change-Id: I697df13efc1d6b5279396d626b0c335c51928892 --- .werks/16417 | 24 +++++++++++++++++++ .../node-collector-machine-sections-ds.yaml | 3 +++ deploy/charts/checkmk/values.yaml | 2 ++ src/checkmk_kube_agent/send_metrics.py | 6 ++--- 4 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 .werks/16417 diff --git a/.werks/16417 b/.werks/16417 new file mode 100644 index 0000000..387c421 --- /dev/null +++ b/.werks/16417 @@ -0,0 +1,24 @@ +Title: Add Configuration Option 'checkmkAgentTimeout' +Class: fix +Compatible: compat +Component: node-collector +Date: 1712152033 +Edition: cre +Knowledge: doc +Level: 1 +State: unknown +Version: 1.5.0 + +The machine-sections-collector executes a version of the 'check_mk_agent' to collect information +about the host. Sometimes this script takes more than five seconds, which causes the following +traceback. + +C+: + File "/usr/local/lib/python3.10/subprocess.py", line 1935, in _wait + raise TimeoutExpired(self.args, timeout) +subprocess.TimeoutExpired: Command '['/usr/local/bin/check_mk_agent']' timed out after 5 seconds +C-: + +If you encounter this error, you can configure a longer timeout via the new option +'nodeCollector.machineSectionsCollector.checkmkAgentTimeout' in the 'values.yaml' configuration +file. diff --git a/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml b/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml index ac9c89f..31c6471 100644 --- a/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml +++ b/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml @@ -61,6 +61,9 @@ spec: - "/usr/local/bin/checkmk-machine-sections-collector" args: - "--log-level={{ .Values.nodeCollector.logLevel }}" + {{- if .Values.nodeCollector.machineSectionsCollector.checkmkAgentTimeout }} + - "--checkmk-agent-timeout={{ .Values.nodeCollector.machineSectionsCollector.checkmkAgentTimeout }}" + {{- end }} {{- if .Values.tlsCommunication.enabled }} - "--secure-protocol" {{- if .Values.tlsCommunication.verifySsl }} diff --git a/deploy/charts/checkmk/values.yaml b/deploy/charts/checkmk/values.yaml index be0cea6..e11a478 100644 --- a/deploy/charts/checkmk/values.yaml +++ b/deploy/charts/checkmk/values.yaml @@ -282,6 +282,8 @@ nodeCollector: cpu: 150m memory: 200Mi + checkmkAgentTimeout: 5 + # the machine sections collector can collect monitoring information for network interfaces of the underlying node. # this means that the '/sys' directory of the node will be mounted into the container. # the pod security policy is adjusted accordingly. diff --git a/src/checkmk_kube_agent/send_metrics.py b/src/checkmk_kube_agent/send_metrics.py index 19d25b1..53375cc 100644 --- a/src/checkmk_kube_agent/send_metrics.py +++ b/src/checkmk_kube_agent/send_metrics.py @@ -301,7 +301,7 @@ def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: help="Collector log level.", ) parser.add_argument( - "--agent-timeout", + "--checkmk-agent-timeout", type=int, help="Checkmk Agent execution timeout in seconds", ) @@ -311,7 +311,7 @@ def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: max_retries=10, polling_interval=60, ca_cert="/etc/ca-certificates/checkmk-ca-cert.pem", - agent_timeout=5, + checkmk_agent_timeout=5, ) return parser.parse_args(argv) @@ -322,7 +322,7 @@ def container_metrics_worker( cluster_collector_base_url: Url, headers: RequestHeaders, verify: SslVerify, - args: argparse.Namespace, # pylint: disable=unused-argument + _args: argparse.Namespace, ) -> None: # pragma: no cover """ Query cadvisor api, send metrics to cluster collector