Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-14346: Fix when DNS operator reports Degraded #373

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions pkg/operator/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/openshift/cluster-dns-operator/pkg/manifests"
operatorconfig "github.com/openshift/cluster-dns-operator/pkg/operator/config"
retryable "github.com/openshift/cluster-dns-operator/pkg/util/retryableerror"
"github.com/openshift/cluster-dns-operator/pkg/util/slice"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -119,11 +120,11 @@ func New(mgr manager.Manager, config operatorconfig.Config) (controller.Controll
DeleteFunc: func(e event.DeleteEvent) bool { return nodePredicate(e.Object) },
UpdateFunc: func(e event.UpdateEvent) bool {
old := e.ObjectOld.(*corev1.Node)
new := e.ObjectNew.(*corev1.Node)
if ignoreNodeForTopologyAwareHints(old) != ignoreNodeForTopologyAwareHints(new) {
nu := e.ObjectNew.(*corev1.Node)
if ignoreNodeForTopologyAwareHints(old) != ignoreNodeForTopologyAwareHints(nu) {
return true
}
if !ignoreNodeForTopologyAwareHints(new) && nodeIsValidForTopologyAwareHints(old) != nodeIsValidForTopologyAwareHints(new) {
if !ignoreNodeForTopologyAwareHints(nu) && nodeIsValidForTopologyAwareHints(old) != nodeIsValidForTopologyAwareHints(nu) {
return true
}
return false
Expand Down Expand Up @@ -240,6 +241,11 @@ func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
} else {
// Handle everything else.
if err := r.ensureDNS(dns, &result); err != nil {
switch e := err.(type) {
case retryable.Error:
logrus.Error(e, "got retryable error; requeueing", "after", e.After())
return reconcile.Result{RequeueAfter: e.After()}, nil
}
errs = append(errs, fmt.Errorf("failed to ensure dns %s: %v", dns.Name, err))
} else if err := r.ensureExternalNameForOpenshiftService(); err != nil {
errs = append(errs, fmt.Errorf("failed to ensure external name for openshift service: %v", err))
Expand Down
210 changes: 153 additions & 57 deletions pkg/operator/controller/dns_status.go

Large diffs are not rendered by default.

456 changes: 325 additions & 131 deletions pkg/operator/controller/dns_status_test.go

Large diffs are not rendered by default.

64 changes: 40 additions & 24 deletions pkg/operator/controller/status/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
co.Status.Conditions = mergeConditions(co.Status.Conditions,
computeOperatorAvailableCondition(state.haveDNS, &state.dns),
operatorProgressingCondition,
computeOperatorDegradedCondition(state.haveDNS, &state.dns),
computeOperatorDegradedCondition(state.haveDNS, &state.dns, oldVersions, newVersions, curVersions),
)
co.Status.Versions = computeOperatorStatusVersions(curVersions)
co.Status.Conditions = mergeConditions(co.Status.Conditions, computeOperatorUpgradeableCondition(&state.dns))
Expand Down Expand Up @@ -405,7 +405,9 @@ func computeOperatorUpgradeableCondition(dns *operatorv1.DNS) configv1.ClusterOp
}

// computeOperatorDegradedCondition computes the operator's current Degraded status state.
func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS) configv1.ClusterOperatorStatusCondition {
func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS, oldVersions, newVersions, curVersions map[string]string) configv1.ClusterOperatorStatusCondition {
var messages []string

if !haveDNS {
return configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorDegraded,
Expand All @@ -415,18 +417,23 @@ func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS) configv
}
}

var degraded bool
for _, cond := range dns.Status.Conditions {
if cond.Type == operatorv1.OperatorStatusTypeDegraded && cond.Status == operatorv1.ConditionTrue {
degraded = true
// See OCPBUGS-14346. If the operator is upgrading, we can't consider it as degraded.
upgrading, _ := isUpgrading(curVersions, oldVersions, newVersions)
if !upgrading {
var degraded bool
for _, cond := range dns.Status.Conditions {
if cond.Type == operatorv1.OperatorStatusTypeDegraded && cond.Status == operatorv1.ConditionTrue {
degraded = true
messages = append(messages, cond.Message)
}
}
}
if degraded {
return configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionTrue,
Reason: "DNSDegraded",
Message: fmt.Sprintf("DNS %s is degraded", dns.Name),
if degraded {
return configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionTrue,
Reason: "DNSDegraded",
Message: fmt.Sprintf("DNS %s is degraded: %s", dns.Name, strings.Join(messages, "\n")),
}
}
}
return configv1.ClusterOperatorStatusCondition{
Expand Down Expand Up @@ -472,24 +479,17 @@ func computeOperatorProgressingCondition(haveDNS bool, dns *operatorv1.DNS, oldV
}
}

upgrading := false
for name, curVersion := range curVersions {
if oldVersion, ok := oldVersions[name]; ok && oldVersion != curVersion {
messages = append(messages, fmt.Sprintf("Upgraded %s to %q.", name, curVersion))
}
if newVersion, ok := newVersions[name]; ok && curVersion != newVersion {
upgrading = true
messages = append(messages, fmt.Sprintf("Upgrading %s to %q.", name, newVersion))
}
}
// If the operator is upgrading, note it as a Progressing reason and add the upgrading messages
upgrading, upgradingMessages := isUpgrading(curVersions, oldVersions, newVersions)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially, I thought this meant "Cluster is Upgrading". Seems like that isn't the case, which I think you are aware of, based on the code comment above. This looks to be based on the version of:

  1. Operator
  2. CoreDNS Pods
  3. KubeRBACProxy
  4. OpenshiftCLI (I'm not sure, but I think this is an image that the NodeResolver daemon set uses)

I don't have any issues with that, but it sounds like it's been implied that this is resolving OCPBUGS-14346, and I wonder if it's worth pointing out that it's only part of the problem.

AFAIK, the degrade status comes from two types of situations during upgrade:

  1. DNS-related pods (the four mentioned above) are rolling out due to new versions
  2. The nodes are rolling out (near the end of the upgrade)

Both situations result in DNS pods going unavailable. Unless I'm misunderstanding, this isUpgrading doesn't account for the node rebooting/rollout portion of the upgrade. Was the intention for isUpgrading to also protect against degraded status during the node rollout? Or am I misunderstanding the problem?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Afaik, the concern is whether the operator is upgrading, not whether the cluster is upgrading.

if upgrading {
status = configv1.ConditionTrue
messages = append(messages, strings.Join(upgradingMessages, "And "))
progressingReasons = append(progressingReasons, "Upgrading")
}

if len(progressingReasons) != 0 {
progressingCondition.Status = status
progressingCondition.Reason = strings.Join(progressingReasons, "And")
progressingCondition.Reason = strings.Join(progressingReasons, "And ")
progressingCondition.Message = strings.Join(messages, "\n")
} else {
progressingCondition.Status = configv1.ConditionFalse
Expand All @@ -500,6 +500,22 @@ func computeOperatorProgressingCondition(haveDNS bool, dns *operatorv1.DNS, oldV
return progressingCondition
}

// isUpgrading reports whether any operand listed in curVersions has not yet
// reached the version expected in newVersions.  It also returns human-readable
// progress messages: an "Upgraded ..." line for each operand whose version
// changed relative to oldVersions, and an "Upgrading ..." line for each
// operand still short of its expected version.  Message order follows Go's
// map iteration order and is therefore unspecified.
func isUpgrading(curVersions, oldVersions, newVersions map[string]string) (bool, []string) {
	upgrading := false
	var msgs []string
	for operand, current := range curVersions {
		if previous, tracked := oldVersions[operand]; tracked && previous != current {
			msgs = append(msgs, fmt.Sprintf("Upgraded %s to %q.", operand, current))
		}
		expected, tracked := newVersions[operand]
		if tracked && expected != current {
			upgrading = true
			msgs = append(msgs, fmt.Sprintf("Upgrading %s to %q.", operand, expected))
		}
	}
	return upgrading, msgs
}

// computeOldVersions returns a map of operand name to version computed from the
// given clusteroperator status.
func computeOldVersions(oldVersions []configv1.OperandVersion) map[string]string {
Expand Down
154 changes: 154 additions & 0 deletions pkg/operator/controller/status/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -631,3 +631,157 @@ func TestComputeCurrentVersions(t *testing.T) {
}
}
}

// TestComputeOperatorDegradedCondition verifies that
// computeOperatorDegradedCondition reports Degraded=True when the DNS
// resource is missing, and Degraded=False otherwise — including while an
// operator or operand upgrade is in progress (see OCPBUGS-14346).
//
// NOTE(review): every case that has a DNS present builds it with a
// Degraded=ConditionFalse condition, so this suite never exercises a DNS
// that is actually degraded — the very situation the upgrade check is meant
// to suppress.  (The original code contained an unreachable
// `if tc.dnsMissing` branch inside `if !tc.dnsMissing` that hinted at this;
// the dead branch has been removed.)  Consider adding a dnsDegraded knob to
// the test table to cover degraded-while-upgrading and
// degraded-while-steady-state.
func TestComputeOperatorDegradedCondition(t *testing.T) {
	// versions pairs an operator version with a single operand version that
	// is fanned out below to all operand entries (CoreDNS, openshift-cli,
	// kube-rbac-proxy).
	type versions struct {
		operator, operand string
	}

	testCases := []struct {
		description    string
		dnsMissing     bool
		oldVersions    versions
		newVersions    versions
		curVersions    versions
		expectDegraded configv1.ConditionStatus
	}{
		{
			description:    "dns does not exist",
			dnsMissing:     true,
			expectDegraded: configv1.ConditionTrue,
		},
		{
			description:    "versions match",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v1", "dns-v1"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator upgrade in progress",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v1"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operand upgrade in progress",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v1", "dns-v2"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator and operand upgrade in progress",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v2"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator upgrade done",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v1"},
			curVersions:    versions{"v2", "dns-v1"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operand upgrade done",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v1", "dns-v2"},
			curVersions:    versions{"v1", "dns-v2"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator and operand upgrade done",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v2"},
			curVersions:    versions{"v2", "dns-v2"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator upgrade in progress, operand upgrade done",
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v2"},
			curVersions:    versions{"v1", "dns-v2"},
			expectDegraded: configv1.ConditionFalse,
		},
		{
			description:    "operator upgrade in progress but no dns",
			dnsMissing:     true,
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v1"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionTrue,
		},
		{
			description:    "operand upgrade in progress, but no dns",
			dnsMissing:     true,
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v1", "dns-v2"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionTrue,
		},
		{
			description:    "operator and operand upgrade in progress, but no dns",
			dnsMissing:     true,
			oldVersions:    versions{"v1", "dns-v1"},
			newVersions:    versions{"v2", "dns-v2"},
			curVersions:    versions{"v1", "dns-v1"},
			expectDegraded: configv1.ConditionTrue,
		},
	}

	for _, tc := range testCases {
		var (
			haveDNS bool
			dns     *operatorv1.DNS
		)
		if !tc.dnsMissing {
			haveDNS = true
			// The DNS resource is present and NOT degraded; the condition
			// under test must come out False regardless of upgrade state.
			dns = &operatorv1.DNS{
				Status: operatorv1.DNSStatus{
					Conditions: []operatorv1.OperatorCondition{{
						Type:   operatorv1.OperatorStatusTypeDegraded,
						Status: operatorv1.ConditionFalse,
					}},
				},
			}
		}
		// Fan the single operand version out to every tracked operand.
		oldVersions := map[string]string{
			OperatorVersionName:     tc.oldVersions.operator,
			CoreDNSVersionName:      tc.oldVersions.operand,
			OpenshiftCLIVersionName: tc.oldVersions.operand,
			KubeRBACProxyName:       tc.oldVersions.operand,
		}
		newVersions := map[string]string{
			OperatorVersionName:     tc.newVersions.operator,
			CoreDNSVersionName:      tc.newVersions.operand,
			OpenshiftCLIVersionName: tc.newVersions.operand,
			KubeRBACProxyName:       tc.newVersions.operand,
		}
		curVersions := map[string]string{
			OperatorVersionName:     tc.curVersions.operator,
			CoreDNSVersionName:      tc.curVersions.operand,
			OpenshiftCLIVersionName: tc.curVersions.operand,
			KubeRBACProxyName:       tc.curVersions.operand,
		}

		expected := configv1.ClusterOperatorStatusCondition{
			Type:   configv1.OperatorDegraded,
			Status: tc.expectDegraded,
		}

		actual := computeOperatorDegradedCondition(haveDNS, dns, oldVersions, newVersions, curVersions)
		// Only Type and Status are pinned; timestamps and human-readable
		// fields are free to vary.
		conditionsCmpOpts := []cmp.Option{
			cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime", "Reason", "Message"),
		}
		if !cmp.Equal(actual, expected, conditionsCmpOpts...) {
			t.Errorf("%q: expected %#v, got %#v", tc.description, expected, actual)
		}
	}
}
Loading