Skip to content

Commit

Permalink
Update to check for multiple etcd pods with IP address conflicts
Browse files Browse the repository at this point in the history
Check for more than one etcd IP address conflict in more than one pod.
This is an unsupported condition that cannot be remediated.
Fix unit tests to reflect the actaul etcd pod specs, by only containing
one IP address per pod.
  • Loading branch information
s-fairchild committed Jul 5, 2023
1 parent 84b0184 commit fa0f972
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 47 deletions.
4 changes: 1 addition & 3 deletions pkg/frontend/admin_openshiftcluster_etcdrecovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,8 @@ func (f *frontend) postAdminOpenShiftClusterEtcdRecovery(w http.ResponseWriter,
}

func (f *frontend) _postAdminOpenShiftClusterEtcdRecovery(ctx context.Context, r *http.Request, log *logrus.Entry) error {
var err error
resType, resName, resGroupName := chi.URLParam(r, "resourceType"), chi.URLParam(r, "resourceName"), chi.URLParam(r, "resourceGroupName")
resourceID := strings.TrimPrefix(r.URL.Path, "/admin")
groupKind := "Etcd"

doc, err := f.dbOpenShiftClusters.Get(ctx, resourceID)
switch {
Expand All @@ -47,7 +45,7 @@ func (f *frontend) _postAdminOpenShiftClusterEtcdRecovery(ctx context.Context, r
return err
}

gvr, err := kubeActions.ResolveGVR(groupKind)
gvr, err := kubeActions.ResolveGVR("Etcd")
if err != nil {
return err
}
Expand Down
41 changes: 29 additions & 12 deletions pkg/frontend/fixetcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -587,20 +587,36 @@ func adjustTimeout(ctx context.Context, log *logrus.Entry) time.Duration {

// comparePodEnvToIp compares the etcd container's environment variables to the pod's actual IP address
func comparePodEnvToIP(log *logrus.Entry, pods *corev1.PodList) (*degradedEtcd, error) {
de := &degradedEtcd{}
degradedEtcds := []degradedEtcd{}
for _, p := range pods.Items {
etcdIP := ipFromEnv(p.Spec.Containers, p.Name)
for _, ip := range p.Status.PodIPs {
if ip.IP != etcdIP && etcdIP != "" {
log.Infof("Found conflicting IPs for etcd Pod %s: %s!=%s", p.Name, ip.IP, etcdIP)
de.Node = strings.ReplaceAll(p.Name, "etcd-", "")
de.Pod = p.Name
de.NewIP = ip.IP
de.OldIP = etcdIP
envIP := ipFromEnv(p.Spec.Containers, p.Name)
for _, podIP := range p.Status.PodIPs {
if podIP.IP != envIP && envIP != "" {
log.Infof("Found conflicting IPs for etcd Pod %s: %s!=%s", p.Name, podIP.IP, envIP)
degradedEtcds = append(degradedEtcds, degradedEtcd{
Node: strings.ReplaceAll(p.Name, "etcd-", ""),
Pod: p.Name,
NewIP: podIP.IP,
OldIP: envIP,
})
break
}
}
}

// Check for multiple etcd pods with IP address conflicts
var de *degradedEtcd
if len(degradedEtcds) > 1 {
return nil, fmt.Errorf("found multiple etcd pods with conflicting IP addresses, only one degraded etcd is supported, unable to recover. Conflicting IPs found: %v", degradedEtcds)
// happens if the env variables are empty, check statuses next
} else if len(degradedEtcds) == 0 {
de = &degradedEtcd{}
} else {
// array is no longer needed
de = &degradedEtcds[0]
}

// TODO check statuses in addition to conflict, currently statuses are only checked if the env variable is equal to ""
// If no conflict is found a recent IP change may still be causing an issue
// Sometimes etcd can recovery the deployment itself, however there is still a data directory with the previous member's IP address present causing a failure
// This can still be remediated by relying on the pod statuses
Expand All @@ -626,9 +642,9 @@ func ipFromEnv(containers []corev1.Container, podName string) string {
if c.Name == "etcd" {
for _, e := range c.Env {
// The environment variable that contains etcd's IP address has the following naming convention
// NODE_cluster_name_c4j2v_master_0_IP
// NODE_cluster_name_infra_ID_master_0_IP
// while the pod looks like this
// etcd-cluster-name-c4j2v-master-0
// etcd-cluster-name-infra-id-master-0
// To find the pod's IP address by variable name we use the pod's name
envName := strings.ReplaceAll(strings.ReplaceAll(podName, "-", "_"), "etcd_", "NODE_")
if e.Name == fmt.Sprintf("%s_IP", envName) {
Expand All @@ -637,6 +653,7 @@ func ipFromEnv(containers []corev1.Container, podName string) string {
}
}
}

return ""
}

Expand All @@ -660,7 +677,7 @@ func findCrashloopingPods(log *logrus.Entry, pods *corev1.PodList) (*corev1.Pod,
for _, c := range crashingPods.Items {
names = append(names, c.Name)
}
return nil, fmt.Errorf("only a single degraded quorum is supported, more than one not Ready etcd pods were found: %v", names)
return nil, fmt.Errorf("only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: %v", names)
} else if len(crashingPods.Items) == 0 {
return nil, errors.New("no etcd pod's were found in a CrashLoopBackOff state, unable to remediate etcd deployment")
}
Expand Down
73 changes: 41 additions & 32 deletions pkg/frontend/fixetcd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ import (

const degradedNode = "master-2"

// TODO fix tests
func TestFixEtcd(t *testing.T) {
// TODO inject timeout duration
ctx := context.WithValue(context.Background(), ctxKey, "TRUE")

const (
Expand Down Expand Up @@ -192,7 +192,7 @@ func TestFixEtcd(t *testing.T) {
},
{
name: "fail: Multiple degraded etcd instances scenario",
wantErr: "only a single degraded quorum is supported, more than one not Ready etcd pods were found: [etcd-cluster-zfsbk-master-2 etcd etcd]",
wantErr: "only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: [etcd-cluster-zfsbk-master-0 etcd-cluster-zfsbk-master-1 etcd-cluster-zfsbk-master-2]",
pods: newDegradedPods(doc, true, true),
mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList) {
buf := &bytes.Buffer{}
Expand Down Expand Up @@ -472,13 +472,22 @@ func buildNodeName(doc *api.OpenShiftClusterDocument, node string) string {
return c + "-" + node
}

// TODO fix degraded pods to not create multiple conflicts every time
func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv bool) *corev1.PodList {
var (
nodeName = buildNodeName(doc, degradedNode)
master0IP = "10.0.0.1"
master1IP = "10.0.0.2"
master2IP = "10.0.0.3"
degradedNodeMaster2 = buildNodeName(doc, degradedNode)
nodeMaster0 = buildNodeName(doc, "master-0")
nodeMaster1 = buildNodeName(doc, "master-1")
)
const (
master0IP = "10.0.0.1"
master1IP = "10.0.0.2"
master2IP = "10.0.0.3"
master2ChangedIP = "10.0.0.9"
)

// Used to test scenario when etcd's env vars are empty, or there is no conflict found
// then statuses will be tests
envs := []corev1.EnvVar{
{
Name: "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_0_IP",
Expand All @@ -489,14 +498,10 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
Value: master1IP,
},
{
// degraded pod
Name: "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_2_IP",
Value: master2IP,
},
}

// Used to test scenario when etcd's env vars are empty, or there is no conflict found
// then statuses will be tests
if emptyEnv {
envs = []corev1.EnvVar{}
}
Expand All @@ -515,39 +520,33 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
},
}

etcd2Statuses := []corev1.ContainerStatus{}
statuses := []corev1.ContainerStatus{}
if multiDegraded {
etcd2Statuses = badStatus
statuses = badStatus
}

return &corev1.PodList{
TypeMeta: metav1.TypeMeta{
Kind: "Etcd",
},
Items: []corev1.Pod{
// IP mismatch container
// healthy pod
{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{
Name: "etcd-" + nodeName,
Name: "etcd-" + nodeMaster0,
Namespace: namespaceEtcds,
},
Status: corev1.PodStatus{
ContainerStatuses: badStatus,
ContainerStatuses: statuses,
PodIPs: []corev1.PodIP{
{
IP: master0IP,
},
{
IP: master1IP,
},
{
IP: master2IP,
},
},
},
Spec: corev1.PodSpec{
NodeName: nodeName,
NodeName: nodeMaster0,
Containers: []corev1.Container{
{
Name: "etcd",
Expand All @@ -556,18 +555,23 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
},
},
},
// healthy pod
{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{
Name: "etcd",
Name: "etcd-" + nodeMaster1,
Namespace: namespaceEtcds,
},
Status: corev1.PodStatus{
ContainerStatuses: etcd2Statuses,
ContainerStatuses: statuses,
PodIPs: []corev1.PodIP{
{
IP: master1IP,
},
},
},
Spec: corev1.PodSpec{
NodeName: "master-1",
// healthy container
NodeName: nodeMaster1,
Containers: []corev1.Container{
{
Name: "etcd",
Expand All @@ -576,17 +580,23 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
},
},
},
// degraded pod
{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{
Name: "etcd",
Name: "etcd-" + degradedNodeMaster2,
Namespace: namespaceEtcds,
},
Status: corev1.PodStatus{
ContainerStatuses: etcd2Statuses,
ContainerStatuses: badStatus,
PodIPs: []corev1.PodIP{
{
IP: master2ChangedIP,
},
},
},
Spec: corev1.PodSpec{
NodeName: "master-0",
// healthy container
NodeName: degradedNodeMaster2,
Containers: []corev1.Container{
{
Name: "etcd",
Expand All @@ -595,6 +605,5 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
},
},
},
},
}
}}
}

0 comments on commit fa0f972

Please sign in to comment.