41 changes: 41 additions & 0 deletions test/extended/two_node/tnf_recovery.go
@@ -169,6 +169,47 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
			memberPromotedVotingTimeout, pollInterval)

	})

	g.It("should recover from etcd process crash", func() {
		// Note: This test kills the etcd process/container on one node to simulate
		// a process crash, testing Pacemaker's ability to detect and restart etcd.
		survivedNode := peerNode
		g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)

		g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
		// Try multiple methods to kill etcd: container kill, process kill, or service stop.
		_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
			"bash", "-c", "podman kill etcd 2>/dev/null || pkill -9 etcd 2>/dev/null || systemctl stop etcd 2>/dev/null || true")
		o.Expect(err).NotTo(o.HaveOccurred(), "Expected to kill etcd process without command errors")

		g.By("Waiting for Pacemaker to detect etcd failure and begin recovery")
		// Give Pacemaker time to detect the failure and start recovery.
		time.Sleep(30 * time.Second)

		g.By(fmt.Sprintf("Ensuring %s becomes leader and %s is re-added as a not-yet-started learner", peerNode.Name, targetNode.Name))
		validateEtcdRecoveryState(etcdClientFactory,
			&survivedNode, true, false, // survivedNode expected started == true, learner == false
			&targetNode, false, true, // targetNode expected started == false, learner == true
			memberIsLeaderTimeout, pollInterval)

		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
		validateEtcdRecoveryState(etcdClientFactory,
			&survivedNode, true, false, // survivedNode expected started == true, learner == false
			&targetNode, true, true, // targetNode expected started == true, learner == true
			memberRejoinedLearnerTimeout, pollInterval)

		g.By(fmt.Sprintf("Ensuring %s is promoted back to voting member", targetNode.Name))
		validateEtcdRecoveryState(etcdClientFactory,
			&survivedNode, true, false, // survivedNode expected started == true, learner == false
			&targetNode, true, false, // targetNode expected started == true, learner == false
			memberPromotedVotingTimeout, pollInterval)

		g.By("Ensuring etcd cluster operator is healthy after recovery")
		o.Eventually(func() error {
			return ensureEtcdOperatorHealthy(oc)
		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
	})
})
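The helper validateEtcdRecoveryState that the new test calls three times is not shown in this hunk. A minimal sketch of what such a helper could look like, assuming it polls the member list via getMembers (shown below) and matches members to nodes by peer URL; the "started" and "learner" checks (non-empty ClientURLs, IsLearner) and the member-matching convention are assumptions for illustration, not the PR's actual implementation.

// Hypothetical sketch only -- the PR's real validateEtcdRecoveryState is defined
// elsewhere in this file and may differ. Assumes the file's existing imports
// (o = gomega, corev1, etcdserverpb, helpers) plus fmt, strings and time.
func validateEtcdRecoveryStateSketch(etcdClientFactory helpers.EtcdClientCreator,
	nodeA *corev1.Node, aStarted, aLearner bool,
	nodeB *corev1.Node, bStarted, bLearner bool,
	timeout, interval time.Duration) {

	// memberFor matches an etcd member to a node by looking for the node's
	// first reported address inside the member's peer URLs (assumed convention).
	memberFor := func(members []*etcdserverpb.Member, node *corev1.Node) *etcdserverpb.Member {
		addr := node.Status.Addresses[0].Address
		for _, m := range members {
			for _, u := range m.PeerURLs {
				if strings.Contains(u, addr) {
					return m
				}
			}
		}
		return nil
	}

	o.Eventually(func() error {
		members, err := getMembers(etcdClientFactory)
		if err != nil {
			return err
		}
		expectations := []struct {
			node    *corev1.Node
			started bool
			learner bool
		}{
			{nodeA, aStarted, aLearner},
			{nodeB, bStarted, bLearner},
		}
		for _, want := range expectations {
			m := memberFor(members, want.node)
			if m == nil {
				return fmt.Errorf("no etcd member found for node %s", want.node.Name)
			}
			// A member that has not started yet has not published client URLs.
			started := len(m.ClientURLs) > 0
			if started != want.started || m.IsLearner != want.learner {
				return fmt.Errorf("node %s: started=%t learner=%t, want started=%t learner=%t",
					want.node.Name, started, m.IsLearner, want.started, want.learner)
			}
		}
		return nil
	}, timeout, interval).ShouldNot(o.HaveOccurred())
}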

func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
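The body of getMembers is collapsed in this view, as are ensureEtcdOperatorHealthy and the timeout constants. A rough sketch of the kind of check a helper like ensureEtcdOperatorHealthy typically performs, assuming it inspects the etcd ClusterOperator conditions through the test's admin config client; the helper name, the exact conditions checked, and the surrounding imports (context, metav1, configv1, exutil) are assumptions, not taken from the PR.

// Hypothetical sketch only -- the PR's real ensureEtcdOperatorHealthy may differ.
// Assumes oc is the test's *exutil.CLI and the usual openshift/api config imports.
func ensureEtcdOperatorHealthySketch(oc *exutil.CLI) error {
	co, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(context.Background(), "etcd", metav1.GetOptions{})
	if err != nil {
		return err
	}
	// Expect Available=True, Progressing=False, Degraded=False.
	want := map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{
		configv1.OperatorAvailable:   configv1.ConditionTrue,
		configv1.OperatorProgressing: configv1.ConditionFalse,
		configv1.OperatorDegraded:    configv1.ConditionFalse,
	}
	for _, cond := range co.Status.Conditions {
		if expected, ok := want[cond.Type]; ok && cond.Status != expected {
			return fmt.Errorf("etcd cluster operator condition %s is %s, expected %s", cond.Type, cond.Status, expected)
		}
	}
	return nil
}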