Update to check for multiple etcd pods with IP address conflicts

Check for more than one etcd IP address conflict in more than one pod. This is an unsupported condition that cannot be remediated. Fix unit tests to reflect the actaul etcd pod specs, by only containing one IP address per pod.
Azure · Jul 5, 2023 · fa0f972 · fa0f972
1 parent 84b0184
commit fa0f972
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 47 deletions.
diff --git a/pkg/frontend/admin_openshiftcluster_etcdrecovery.go b/pkg/frontend/admin_openshiftcluster_etcdrecovery.go
@@ -30,10 +30,8 @@ func (f *frontend) postAdminOpenShiftClusterEtcdRecovery(w http.ResponseWriter,
 }
 
 func (f *frontend) _postAdminOpenShiftClusterEtcdRecovery(ctx context.Context, r *http.Request, log *logrus.Entry) error {
-	var err error
 	resType, resName, resGroupName := chi.URLParam(r, "resourceType"), chi.URLParam(r, "resourceName"), chi.URLParam(r, "resourceGroupName")
 	resourceID := strings.TrimPrefix(r.URL.Path, "/admin")
-	groupKind := "Etcd"
 
 	doc, err := f.dbOpenShiftClusters.Get(ctx, resourceID)
 	switch {
@@ -47,7 +45,7 @@ func (f *frontend) _postAdminOpenShiftClusterEtcdRecovery(ctx context.Context, r
 		return err
 	}
 
-	gvr, err := kubeActions.ResolveGVR(groupKind)
+	gvr, err := kubeActions.ResolveGVR("Etcd")
 	if err != nil {
 		return err
 	}

diff --git a/pkg/frontend/fixetcd.go b/pkg/frontend/fixetcd.go
@@ -587,20 +587,36 @@ func adjustTimeout(ctx context.Context, log *logrus.Entry) time.Duration {
 
 // comparePodEnvToIp compares the etcd container's environment variables to the pod's actual IP address
 func comparePodEnvToIP(log *logrus.Entry, pods *corev1.PodList) (*degradedEtcd, error) {
-	de := &degradedEtcd{}
+	degradedEtcds := []degradedEtcd{}
 	for _, p := range pods.Items {
-		etcdIP := ipFromEnv(p.Spec.Containers, p.Name)
-		for _, ip := range p.Status.PodIPs {
-			if ip.IP != etcdIP && etcdIP != "" {
-				log.Infof("Found conflicting IPs for etcd Pod %s: %s!=%s", p.Name, ip.IP, etcdIP)
-				de.Node = strings.ReplaceAll(p.Name, "etcd-", "")
-				de.Pod = p.Name
-				de.NewIP = ip.IP
-				de.OldIP = etcdIP
+		envIP := ipFromEnv(p.Spec.Containers, p.Name)
+		for _, podIP := range p.Status.PodIPs {
+			if podIP.IP != envIP && envIP != "" {
+				log.Infof("Found conflicting IPs for etcd Pod %s: %s!=%s", p.Name, podIP.IP, envIP)
+				degradedEtcds = append(degradedEtcds, degradedEtcd{
+					Node:  strings.ReplaceAll(p.Name, "etcd-", ""),
+					Pod:   p.Name,
+					NewIP: podIP.IP,
+					OldIP: envIP,
+				})
+				break
 			}
 		}
 	}
 
+	// Check for multiple etcd pods with IP address conflicts
+	var de *degradedEtcd
+	if len(degradedEtcds) > 1 {
+		return nil, fmt.Errorf("found multiple etcd pods with conflicting IP addresses, only one degraded etcd is supported, unable to recover. Conflicting IPs found: %v", degradedEtcds)
+		// happens if the env variables are empty, check statuses next
+	} else if len(degradedEtcds) == 0 {
+		de = &degradedEtcd{}
+	} else {
+		// array is no longer needed
+		de = &degradedEtcds[0]
+	}
+
+	// TODO check statuses in addition to conflict, currently statuses are only checked if the env variable is equal to ""
 	// If no conflict is found a recent IP change may still be causing an issue
 	// Sometimes etcd can recovery the deployment itself, however there is still a data directory with the previous member's IP address present causing a failure
 	// This can still be remediated by relying on the pod statuses
@@ -626,9 +642,9 @@ func ipFromEnv(containers []corev1.Container, podName string) string {
 		if c.Name == "etcd" {
 			for _, e := range c.Env {
 				// The environment variable that contains etcd's IP address has the following naming convention
-				// NODE_cluster_name_c4j2v_master_0_IP
+				// NODE_cluster_name_infra_ID_master_0_IP
 				// while the pod looks like this
-				// etcd-cluster-name-c4j2v-master-0
+				// etcd-cluster-name-infra-id-master-0
 				// To find the pod's IP address by variable name we use the pod's name
 				envName := strings.ReplaceAll(strings.ReplaceAll(podName, "-", "_"), "etcd_", "NODE_")
 				if e.Name == fmt.Sprintf("%s_IP", envName) {
@@ -637,6 +653,7 @@ func ipFromEnv(containers []corev1.Container, podName string) string {
 			}
 		}
 	}
+
 	return ""
 }
 
@@ -660,7 +677,7 @@ func findCrashloopingPods(log *logrus.Entry, pods *corev1.PodList) (*corev1.Pod,
 		for _, c := range crashingPods.Items {
 			names = append(names, c.Name)
 		}
-		return nil, fmt.Errorf("only a single degraded quorum is supported, more than one not Ready etcd pods were found: %v", names)
+		return nil, fmt.Errorf("only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: %v", names)
 	} else if len(crashingPods.Items) == 0 {
 		return nil, errors.New("no etcd pod's were found in a CrashLoopBackOff state, unable to remediate etcd deployment")
 	}

diff --git a/pkg/frontend/fixetcd_test.go b/pkg/frontend/fixetcd_test.go
@@ -25,8 +25,8 @@ import (
 
 const degradedNode = "master-2"
 
-// TODO fix tests
 func TestFixEtcd(t *testing.T) {
+	// TODO inject timeout duration
 	ctx := context.WithValue(context.Background(), ctxKey, "TRUE")
 
 	const (
@@ -192,7 +192,7 @@ func TestFixEtcd(t *testing.T) {
 		},
 		{
 			name:    "fail: Multiple degraded etcd instances scenario",
-			wantErr: "only a single degraded quorum is supported, more than one not Ready etcd pods were found: [etcd-cluster-zfsbk-master-2 etcd etcd]",
+			wantErr: "only a single degraded etcd pod can can be recovered from, more than one NotReady etcd pods were found: [etcd-cluster-zfsbk-master-0 etcd-cluster-zfsbk-master-1 etcd-cluster-zfsbk-master-2]",
 			pods:    newDegradedPods(doc, true, true),
 			mocks: func(tt *test, t *testing.T, ti *testInfra, k *mock_adminactions.MockKubeActions, pods *corev1.PodList) {
 				buf := &bytes.Buffer{}
@@ -472,13 +472,22 @@ func buildNodeName(doc *api.OpenShiftClusterDocument, node string) string {
 	return c + "-" + node
 }
 
+// TODO fix degraded pods to not create multiple conflicts every time
 func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv bool) *corev1.PodList {
 	var (
-		nodeName  = buildNodeName(doc, degradedNode)
-		master0IP = "10.0.0.1"
-		master1IP = "10.0.0.2"
-		master2IP = "10.0.0.3"
+		degradedNodeMaster2 = buildNodeName(doc, degradedNode)
+		nodeMaster0         = buildNodeName(doc, "master-0")
+		nodeMaster1         = buildNodeName(doc, "master-1")
 	)
+	const (
+		master0IP        = "10.0.0.1"
+		master1IP        = "10.0.0.2"
+		master2IP        = "10.0.0.3"
+		master2ChangedIP = "10.0.0.9"
+	)
+
+	// Used to test scenario when etcd's env vars are empty, or there is no conflict found
+	// then statuses will be tests
 	envs := []corev1.EnvVar{
 		{
 			Name:  "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_0_IP",
@@ -489,14 +498,10 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
 			Value: master1IP,
 		},
 		{
-			// degraded pod
 			Name:  "NODE_" + doc.OpenShiftCluster.Name + "_" + doc.OpenShiftCluster.Properties.InfraID + "_master_2_IP",
 			Value: master2IP,
 		},
 	}
-
-	// Used to test scenario when etcd's env vars are empty, or there is no conflict found
-	// then statuses will be tests
 	if emptyEnv {
 		envs = []corev1.EnvVar{}
 	}
@@ -515,39 +520,33 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
 		},
 	}
 
-	etcd2Statuses := []corev1.ContainerStatus{}
+	statuses := []corev1.ContainerStatus{}
 	if multiDegraded {
-		etcd2Statuses = badStatus
+		statuses = badStatus
 	}
 
 	return &corev1.PodList{
 		TypeMeta: metav1.TypeMeta{
 			Kind: "Etcd",
 		},
 		Items: []corev1.Pod{
-			// IP mismatch container
+			// healthy pod
 			{
 				TypeMeta: metav1.TypeMeta{},
 				ObjectMeta: metav1.ObjectMeta{
-					Name:      "etcd-" + nodeName,
+					Name:      "etcd-" + nodeMaster0,
 					Namespace: namespaceEtcds,
 				},
 				Status: corev1.PodStatus{
-					ContainerStatuses: badStatus,
+					ContainerStatuses: statuses,
 					PodIPs: []corev1.PodIP{
 						{
 							IP: master0IP,
 						},
-						{
-							IP: master1IP,
-						},
-						{
-							IP: master2IP,
-						},
 					},
 				},
 				Spec: corev1.PodSpec{
-					NodeName: nodeName,
+					NodeName: nodeMaster0,
 					Containers: []corev1.Container{
 						{
 							Name: "etcd",
@@ -556,18 +555,23 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
 					},
 				},
 			},
+			// healthy pod
 			{
 				TypeMeta: metav1.TypeMeta{},
 				ObjectMeta: metav1.ObjectMeta{
-					Name:      "etcd",
+					Name:      "etcd-" + nodeMaster1,
 					Namespace: namespaceEtcds,
 				},
 				Status: corev1.PodStatus{
-					ContainerStatuses: etcd2Statuses,
+					ContainerStatuses: statuses,
+					PodIPs: []corev1.PodIP{
+						{
+							IP: master1IP,
+						},
+					},
 				},
 				Spec: corev1.PodSpec{
-					NodeName: "master-1",
-					// healthy container
+					NodeName: nodeMaster1,
 					Containers: []corev1.Container{
 						{
 							Name: "etcd",
@@ -576,17 +580,23 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
 					},
 				},
 			},
+			// degraded pod
 			{
 				TypeMeta: metav1.TypeMeta{},
 				ObjectMeta: metav1.ObjectMeta{
-					Name: "etcd",
+					Name:      "etcd-" + degradedNodeMaster2,
+					Namespace: namespaceEtcds,
 				},
 				Status: corev1.PodStatus{
-					ContainerStatuses: etcd2Statuses,
+					ContainerStatuses: badStatus,
+					PodIPs: []corev1.PodIP{
+						{
+							IP: master2ChangedIP,
+						},
+					},
 				},
 				Spec: corev1.PodSpec{
-					NodeName: "master-0",
-					// healthy container
+					NodeName: degradedNodeMaster2,
 					Containers: []corev1.Container{
 						{
 							Name: "etcd",
@@ -595,6 +605,5 @@ func newDegradedPods(doc *api.OpenShiftClusterDocument, multiDegraded, emptyEnv
 					},
 				},
 			},
-		},
-	}
+		}}
 }