From 1479169a080efc58222c6f0ae7364a9dc248f353 Mon Sep 17 00:00:00 2001 From: Spencer Amann Date: Thu, 31 Aug 2023 23:38:29 +0000 Subject: [PATCH] increase machine health check node unready timeout to 15m (#3133) * increase machine health check node unready timeout to 15m * update mhc docs * increase machine health check node startup timeout to 25m --- pkg/operator/controllers/machinehealthcheck/doc.go | 4 ++-- .../staticresources/machinehealthcheck.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/operator/controllers/machinehealthcheck/doc.go b/pkg/operator/controllers/machinehealthcheck/doc.go index c8a4e5a369b..2bc51cee329 100644 --- a/pkg/operator/controllers/machinehealthcheck/doc.go +++ b/pkg/operator/controllers/machinehealthcheck/doc.go @@ -19,11 +19,11 @@ aro.machinehealthcheck.managed - When set to false, the controller will attempt to remove the aro-machinehealthcheck CR and the MHC Remediation alert from the cluster. This should effectively disable the MHC we deploy and prevent the automatic reconciliation of nodes. - When set to true, the controller will deploy/overwrite the aro-machinehealthcheck CR and the MHC Remediation alert to the cluster. - This enables the cluster to self heal when at most 1 worker node goes not ready for at least 5 minutes and alert when remediation + This enables the cluster to self heal when at most 1 worker node goes not ready for at least 15 minutes and alert when remediation occurs 2 or more times within an hour. The aro-machinehealth check is configured in a way that if 2 worker nodes go not ready it will not take any action. More information about how the MHC works can be found here: -https://docs.openshift.com/container-platform/4.9/machine_management/deploying-machine-health-checks.html +https://docs.openshift.com/container-platform/4.12/machine_management/deploying-machine-health-checks.html */ diff --git a/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml b/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml index 86ff3f3dd85..33f5af6dad2 100644 --- a/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml +++ b/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml @@ -15,10 +15,10 @@ spec: operator: Exists unhealthyConditions: - type: "Ready" - timeout: "300s" + timeout: "15m" status: "False" - type: "Ready" - timeout: "300s" + timeout: "15m" status: "Unknown" maxUnhealthy: "1" - nodeStartupTimeout: "20m" + nodeStartupTimeout: "25m"