diff --git a/catalogd/cmd/catalogd/main.go b/catalogd/cmd/catalogd/main.go index 91d82bedd..66c455a10 100644 --- a/catalogd/cmd/catalogd/main.go +++ b/catalogd/cmd/catalogd/main.go @@ -235,9 +235,9 @@ func main() { LeaderElectionID: "catalogd-operator-lock", // Recommended Leader Election values // https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption - LeaseDuration: ptr.To(137 * time.Second), - RenewDeadline: ptr.To(107 * time.Second), - RetryPeriod: ptr.To(26 * time.Second), + LeaseDuration: ptr.To(137 * time.Second), // Default: 15s + RenewDeadline: ptr.To(107 * time.Second), // Default: 10s + RetryPeriod: ptr.To(26 * time.Second), // Default: 2s WebhookServer: webhookServer, Cache: cacheOptions, diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index 76c0e4af4..1e3c1a699 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -235,9 +235,9 @@ func main() { LeaderElectionID: "9c4404e7.operatorframework.io", // Recommended Leader Election values // https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption - LeaseDuration: ptr.To(137 * time.Second), - RenewDeadline: ptr.To(107 * time.Second), - RetryPeriod: ptr.To(26 * time.Second), + LeaseDuration: ptr.To(137 * time.Second), // Default: 15s + RenewDeadline: ptr.To(107 * time.Second), // Default: 10s + RetryPeriod: ptr.To(26 * time.Second), // Default: 2s Cache: cacheOptions, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go index 204c79330..e91b8873e 100644 --- a/test/upgrade-e2e/post_upgrade_test.go +++ b/test/upgrade-e2e/post_upgrade_test.go @@ -40,6 +40,21 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { t.Log("Wait for operator-controller deployment to be 
ready") managerPod := waitForDeployment(t, ctx, "operator-controller-controller-manager") + t.Log("Start measuring leader election time") + // - Best case (new pod starts): ~1–5 seconds + // - Average case (leader exists, but lease not expired): ~26–52 seconds + // - Worst case (leader was there but crashed): LeaseDuration (137s) plus RetryPeriod (26s) = ~163 seconds + leaderStartTime := time.Now() + leaderElectionCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) + defer leaderCancel() + + leaderSubstrings := []string{"successfully acquired lease"} + leaderElected, err := watchPodLogsForSubstring(leaderElectionCtx, managerPod, "manager", leaderSubstrings...) + require.NoError(t, err) + require.True(t, leaderElected) + leaderElectionDuration := time.Since(leaderStartTime) + t.Logf("Leader election took %v seconds", leaderElectionDuration.Seconds()) + t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it") // Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes logCtx, cancel := context.WithTimeout(ctx, time.Minute)