diff --git a/pkg/operator/controllers/machinehealthcheck/doc.go b/pkg/operator/controllers/machinehealthcheck/doc.go index c8a4e5a369b..2bc51cee329 100644 --- a/pkg/operator/controllers/machinehealthcheck/doc.go +++ b/pkg/operator/controllers/machinehealthcheck/doc.go @@ -19,11 +19,11 @@ aro.machinehealthcheck.managed - When set to false, the controller will attempt to remove the aro-machinehealthcheck CR and the MHC Remediation alert from the cluster. This should effectively disable the MHC we deploy and prevent the automatic reconciliation of nodes. - When set to true, the controller will deploy/overwrite the aro-machinehealthcheck CR and the MHC Remediation alert to the cluster. - This enables the cluster to self heal when at most 1 worker node goes not ready for at least 5 minutes and alert when remediation + This enables the cluster to self-heal when at most 1 worker node becomes not ready for at least 15 minutes and to alert when remediation occurs 2 or more times within an hour. The aro-machinehealth check is configured in a way that if 2 worker nodes go not ready it will not take any action. 
More information about how the MHC works can be found here: -https://docs.openshift.com/container-platform/4.9/machine_management/deploying-machine-health-checks.html +https://docs.openshift.com/container-platform/4.12/machine_management/deploying-machine-health-checks.html */ diff --git a/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml b/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml index 86ff3f3dd85..33f5af6dad2 100644 --- a/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml +++ b/pkg/operator/controllers/machinehealthcheck/staticresources/machinehealthcheck.yaml @@ -15,10 +15,10 @@ spec: operator: Exists unhealthyConditions: - type: "Ready" - timeout: "300s" + timeout: "15m" status: "False" - type: "Ready" - timeout: "300s" + timeout: "15m" status: "Unknown" maxUnhealthy: "1" - nodeStartupTimeout: "20m" + nodeStartupTimeout: "25m"