diff --git a/controllers/upgrade_information_collector.go b/controllers/upgrade_information_collector.go index a91686d..4437e1c 100644 --- a/controllers/upgrade_information_collector.go +++ b/controllers/upgrade_information_collector.go @@ -47,7 +47,7 @@ var poolsPausedDesc = prometheus.NewDesc( var jobStates = prometheus.NewDesc( MetricsNamespace+"_upgradejob_state", - "Returns the state of jobs in the cluster. 'pending', 'active', 'succeeded', or 'failed' are possible states.", + "Returns the state of jobs in the cluster. 'pending', 'active', 'succeeded', or 'failed' are possible states. Final states may have a reason.", []string{ "upgradejob", "start_after", @@ -56,6 +56,7 @@ var jobStates = prometheus.NewDesc( "desired_version_image", "desired_version_version", "state", + "reason", "matches_disruptive_hooks", }, nil, @@ -154,6 +155,7 @@ func (m *UpgradeInformationCollector) Collect(ch chan<- prometheus.Metric) { v.Image, v.Version, jobState(job), + jobStateReason(job), strconv.FormatBool(jobHasMatchingDisruptiveHook(job, jobsHooks)), ) } @@ -166,6 +168,20 @@ func boolToFloat64(b bool) float64 { return 0 } +// jobStateReason returns the reason of the job's Succeeded condition if it is true, otherwise the reason of its Failed condition if that is true. +// All final states should have a reason; jobs that are not in a final state yield an empty string. 
+func jobStateReason(job managedupgradev1beta1.UpgradeJob) string { + sc := apimeta.FindStatusCondition(job.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionSucceeded) + if sc != nil && sc.Status == metav1.ConditionTrue { + return sc.Reason + } + sf := apimeta.FindStatusCondition(job.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionFailed) + if sf != nil && sf.Status == metav1.ConditionTrue { + return sf.Reason + } + return "" +} + func jobState(job managedupgradev1beta1.UpgradeJob) string { if apimeta.IsStatusConditionTrue(job.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionSucceeded) { return "succeeded" diff --git a/controllers/upgrade_information_collector_test.go b/controllers/upgrade_information_collector_test.go index c412f4d..a08c1f0 100644 --- a/controllers/upgrade_information_collector_test.go +++ b/controllers/upgrade_information_collector_test.go @@ -198,6 +198,7 @@ func Test_ClusterUpgradingMetric(t *testing.T) { Status: metav1.ConditionTrue, }, { Type: managedupgradev1beta1.UpgradeJobConditionSucceeded, + Reason: managedupgradev1beta1.UpgradeJobReasonSkipped, Status: metav1.ConditionTrue, }, }, @@ -214,6 +215,7 @@ func Test_ClusterUpgradingMetric(t *testing.T) { Status: metav1.ConditionTrue, }, { Type: managedupgradev1beta1.UpgradeJobConditionFailed, + Reason: managedupgradev1beta1.UpgradeJobReasonHookFailed, Status: metav1.ConditionTrue, }, }, @@ -275,17 +277,17 @@ openshift_upgrade_controller_machine_config_pools_upgrading{pool="master"} %d openshift_upgrade_controller_machine_config_pools_upgrading{pool="worker"} %d openshift_upgrade_controller_machine_config_pools_upgrading{pool="paused1"} 0 openshift_upgrade_controller_machine_config_pools_upgrading{pool="paused2"} 0 -# HELP openshift_upgrade_controller_upgradejob_state Returns the state of jobs in the cluster. 'pending', 'active', 'succeeded', or 'failed' are possible states. 
+# HELP openshift_upgrade_controller_upgradejob_state Returns the state of jobs in the cluster. 'pending', 'active', 'succeeded', or 'failed' are possible states. Final states may have a reason. # TYPE openshift_upgrade_controller_upgradejob_state gauge -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="active",upgradejob="active"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="failed",upgradejob="failed"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="paused",upgradejob="paused"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="succeeded",upgradejob="succeeded"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="true",desired_version_image="quay.io/openshift-release-dev/ocp-release@sha256:26f6d10b18",desired_version_version="4.11.23",matches_disruptive_hooks="false",start_after="2020-01-20T20:00:00Z",start_before="2020-01-20T21:00:00Z",state="pending",upgradejob="pending"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",reason="",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="active",upgradejob="active"} 1 
+openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",reason="HookFailed",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="failed",upgradejob="failed"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",reason="",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="paused",upgradejob="paused"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",reason="Skipped",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="succeeded",upgradejob="succeeded"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="true",desired_version_image="quay.io/openshift-release-dev/ocp-release@sha256:26f6d10b18",desired_version_version="4.11.23",matches_disruptive_hooks="false",reason="",start_after="2020-01-20T20:00:00Z",start_before="2020-01-20T21:00:00Z",state="pending",upgradejob="pending"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="true",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive"} 1 -openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="true",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive-unclaimed-next"} 1 
-openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive-claimed-next"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="true",reason="",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="true",reason="",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive-unclaimed-next"} 1 +openshift_upgrade_controller_upgradejob_state{desired_version_force="false",desired_version_image="",desired_version_version="",matches_disruptive_hooks="false",reason="",start_after="0001-01-01T00:00:00Z",start_before="0001-01-01T00:00:00Z",state="pending",upgradejob="disruptive-claimed-next"} 1 ` return strings.NewReader( fmt.Sprintf(metrics, b2i(upgrading), b2i(masterUpgrading), b2i(workerUpgrading)),