diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go
index 2fe727bbd02c..2159d987e949 100644
--- a/test/extended/two_node/tnf_recovery.go
+++ b/test/extended/two_node/tnf_recovery.go
@@ -169,6 +169,47 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			memberPromotedVotingTimeout, pollInterval)
 	})
+
+	g.It("should recover from etcd process crash", func() {
+		// Note: This test kills the etcd process/container on one node to simulate
+		// a process crash, testing Pacemaker's ability to detect and restart etcd
+		survivedNode := peerNode
+		g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
+
+		g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
+		// Try multiple methods to kill etcd - container kill, process kill, or service stop
+		_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
+			"bash", "-c", "podman kill etcd 2>/dev/null || pkill -9 etcd 2>/dev/null || systemctl stop etcd 2>/dev/null || true")
+		o.Expect(err).To(o.BeNil(), "Expected to kill etcd process without command errors")
+
+		g.By("Waiting for Pacemaker to detect etcd failure and begin recovery")
+		// Give Pacemaker time to detect the failure and start recovery
+		time.Sleep(30 * time.Second)
+
+		g.By(fmt.Sprintf("Ensuring %s becomes leader and %s rejoins as learner", peerNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s is promoted back to voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+
+		g.By("Ensuring etcd cluster operator is healthy after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+	})
 })
 
 func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
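
The `validateEtcdRecoveryState` helper that the three assertion blocks call is defined elsewhere in the package (the tail of the diff shows the related `getMembers` helper). For readers following the `started`/`learner` pairs in the calls above, here is a minimal, hypothetical sketch of the polling pattern such a helper implements: list the members, check the flags for the node of interest, and retry until the expected state is reached or a timeout expires. The function name `waitForMemberState` and the "an unstarted member has an empty `Name`" heuristic are assumptions for illustration, not the PR's actual code:

```go
package main

import (
	"fmt"
	"time"

	"go.etcd.io/etcd/api/v3/etcdserverpb"
)

// waitForMemberState polls the member list until some member matching nodeName
// reports the wanted started/learner flags, or the timeout expires.
// Sketch only: the real origin helper also validates the surviving node and
// leadership in the same pass.
func waitForMemberState(getMembers func() ([]*etcdserverpb.Member, error),
	nodeName string, wantStarted, wantLearner bool,
	timeout, interval time.Duration) error {

	deadline := time.Now().Add(timeout)
	for {
		members, err := getMembers()
		if err == nil {
			for _, m := range members {
				// Assumed heuristic: a member that has not started yet has no
				// published Name, so match started members by name and treat
				// nameless members as candidates for the "not started" case.
				started := m.Name != ""
				nameMatches := m.Name == nodeName || !started
				if nameMatches && started == wantStarted && m.IsLearner == wantLearner {
					return nil
				}
			}
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("node %s: member did not reach started=%v learner=%v within %v",
				nodeName, wantStarted, wantLearner, timeout)
		}
		time.Sleep(interval)
	}
}
```

Under this reading, the three blocks in the test trace the expected recovery sequence after the crash: the rejoining member first appears as a not-yet-started learner, then starts as a learner, and is finally promoted back to a voting member.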