Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion test/extended/two_node/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ type hypervisorExtendedConfig struct {
HypervisorKnownHostsPath string
}

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery", func() {
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive][Serial] Two Node with Fencing etcd recovery", func() {
defer g.GinkgoRecover()

var (
Expand Down Expand Up @@ -234,6 +234,29 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
&nodeB, true, false, // member on node B expected started == true, learner == false
membersHealthyAfterDoubleReboot, pollInterval)
})

g.It("should recover from etcd process crash [Skipped:KnownIssue]", func() {
	// This spec kills the etcd container on one node to simulate a process
	// crash, exercising Pacemaker's ability to detect the failure and
	// restart etcd.
	// Currently skipped due to OCPBUGS-59238: rapid podman-etcd restart
	// fails on unpatched clusters.
	// NOTE(review): indexing Status.Addresses[0] assumes both nodes report at
	// least one address — confirm the setup code guarantees this.
	g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
		targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)

	g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
	// The command's stderr is discarded; only the debug-node invocation
	// error is checked below.
	_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", "podman kill etcd 2>/dev/null")
	o.Expect(err).NotTo(o.HaveOccurred(), "Expected to kill etcd process without command errors")

	g.By("Waiting for cluster to recover - both nodes become started voting members")
	// Validation is given a 6-minute budget with a 45-second interval
	// (i.e. up to 8 attempts).
	validateEtcdRecoveryState(oc, etcdClientFactory,
		&peerNode,
		&targetNode, true, false, // targetNode expected started == true, learner == false
		6*time.Minute, 45*time.Second)
})
})

func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
Expand Down