From 0ee61e205827a3e6abdc96e33cfdef0e5ce4f56d Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Fri, 31 Jan 2025 03:37:20 +0000 Subject: [PATCH] fix(e2e): wait for leader election & measure timing for better monitoring TestClusterExtensionAfterOLMUpgrade was failing due to increased leader election timeouts, causing reconciliation checks to run before leadership was acquired. This fix ensures the test explicitly waits for leader election logs (`"successfully acquired lease"`) before verifying reconciliation. Additionally, the test now measures and logs the leader election duration to help monitor election timing. --- catalogd/cmd/catalogd/main.go | 6 +++--- cmd/operator-controller/main.go | 6 +++--- test/upgrade-e2e/post_upgrade_test.go | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/catalogd/cmd/catalogd/main.go b/catalogd/cmd/catalogd/main.go index 91d82bedd..66c455a10 100644 --- a/catalogd/cmd/catalogd/main.go +++ b/catalogd/cmd/catalogd/main.go @@ -235,9 +235,9 @@ func main() { LeaderElectionID: "catalogd-operator-lock", // Recommended Leader Election values // https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption - LeaseDuration: ptr.To(137 * time.Second), - RenewDeadline: ptr.To(107 * time.Second), - RetryPeriod: ptr.To(26 * time.Second), + LeaseDuration: ptr.To(137 * time.Second), // Default: 15s + RenewDeadline: ptr.To(107 * time.Second), // Default: 10s + RetryPeriod: ptr.To(26 * time.Second), // Default: 2s WebhookServer: webhookServer, Cache: cacheOptions, diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index 76c0e4af4..1e3c1a699 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -235,9 +235,9 @@ func main() { LeaderElectionID: "9c4404e7.operatorframework.io", // Recommended Leader Election values // 
https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption - LeaseDuration: ptr.To(137 * time.Second), - RenewDeadline: ptr.To(107 * time.Second), - RetryPeriod: ptr.To(26 * time.Second), + LeaseDuration: ptr.To(137 * time.Second), // Default: 15s + RenewDeadline: ptr.To(107 * time.Second), // Default: 10s + RetryPeriod: ptr.To(26 * time.Second), // Default: 2s Cache: cacheOptions, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go index 204c79330..e91b8873e 100644 --- a/test/upgrade-e2e/post_upgrade_test.go +++ b/test/upgrade-e2e/post_upgrade_test.go @@ -40,6 +40,21 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { t.Log("Wait for operator-controller deployment to be ready") managerPod := waitForDeployment(t, ctx, "operator-controller-controller-manager") + t.Log("Start measuring leader election time") + // - Best case (new pod starts): ~1–5 seconds + // - Average case (leader exists, but lease not expired): ~26–52 seconds + // - Worst case (leader was there but crashed): LeaseDuration (137s) + RetryPeriod (26s) = ~163 secs + leaderStartTime := time.Now() + leaderElectionCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) + defer leaderCancel() + + leaderSubstrings := []string{"successfully acquired lease"} + leaderElected, err := watchPodLogsForSubstring(leaderElectionCtx, managerPod, "manager", leaderSubstrings...) 
+ require.NoError(t, err) + require.True(t, leaderElected) + leaderElectionDuration := time.Since(leaderStartTime) + t.Logf("Leader election took %v seconds", leaderElectionDuration.Seconds()) + t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it") // Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes logCtx, cancel := context.WithTimeout(ctx, time.Minute)