Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: ETCD-612: Added a function to check if the quorum is safe #1278

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -137,5 +137,6 @@ require (

replace (
github.com/gogo/protobuf => github.com/gogo/protobuf v1.3.2
github.com/openshift/library-go => github.com/jubittajohn/library-go v0.0.0-20240723232143-3c6c63efe923
vbom.ml/util => github.com/fvbommel/util v0.0.0-20180919145318-efcd4e0f9787
)
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/jubittajohn/library-go v0.0.0-20240723232143-3c6c63efe923 h1:K04QH8qBzeS5IjNTJK3DQbDUj0yBDXyvdkPSYQ3VZxo=
github.com/jubittajohn/library-go v0.0.0-20240723232143-3c6c63efe923/go.mod h1:PdASVamWinll2BPxiUpXajTwZxV8A1pQbWEsCN1od7I=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
Expand Down Expand Up @@ -309,8 +311,6 @@ github.com/openshift/build-machinery-go v0.0.0-20240419090851-af9c868bcf52 h1:bq
github.com/openshift/build-machinery-go v0.0.0-20240419090851-af9c868bcf52/go.mod h1:b1BuldmJlbA/xYtdZvKi+7j5YGB44qJUJDZ9zwiNCfE=
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87 h1:JtLhaGpSEconE+1IKmIgCOof/Len5ceG6H1pk43yv5U=
github.com/openshift/client-go v0.0.0-20240528061634-b054aa794d87/go.mod h1:3IPD4U0qyovZS4EFady2kqY32m8lGcbs/Wx+yprg9z8=
github.com/openshift/library-go v0.0.0-20240619140217-e20ca28ddfe7 h1:kgkHtO+fI1OdanQHZYTQgImqzTzS4naBw8SMQbB18DI=
github.com/openshift/library-go v0.0.0-20240619140217-e20ca28ddfe7/go.mod h1:PdASVamWinll2BPxiUpXajTwZxV8A1pQbWEsCN1od7I=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
Expand Down
23 changes: 0 additions & 23 deletions pkg/operator/etcdcertsigner/etcdcertsignercontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,18 +392,6 @@ func (c *EtcdCertSignerController) syncLeafCertificates(
Type: corev1.SecretTypeOpaque,
Data: allCerts,
}

// Check whether quorum is still safe after generating the certs, unless we're in force mode.
if !forceSkipRollout {
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("EtcdCertSignerController can't evaluate whether quorum is safe: %w", err)
}

if !safe {
return fmt.Errorf("skipping EtcdCertSignerController reconciliation due to insufficient quorum")
}
}
_, _, err = resourceapply.ApplySecret(ctx, c.secretClient, recorder, secret)
return err
}
Expand Down Expand Up @@ -474,17 +462,6 @@ func (c *EtcdCertSignerController) ensureBundles(ctx context.Context,
// the leaf certificates are not regenerated too early.
configMap.Annotations[BundleRolloutRevisionAnnotation] = fmt.Sprintf("%d", currentRevision)

// The rollout may be stuck due to a missing etcd-all-bundles configmap, so the quorum check is skipped when forceSkipRollout is set; this lets the configmap be regenerated and the next revision install.
if !forceSkipRollout {
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return nil, nil, false, fmt.Errorf("EtcdCertSignerController.ensureBundles can't evaluate whether quorum is safe: %w", err)
}
if !safe {
return nil, nil, false, fmt.Errorf("skipping EtcdCertSignerController.ensureBundles reconciliation due to insufficient quorum")
}
}

_, rolloutTriggered, err = resourceapply.ApplyConfigMap(ctx, c.configMapClient, recorder, configMap)
if err != nil {
return nil, nil, rolloutTriggered, fmt.Errorf("could not apply bundle configmap: %w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,6 @@ func (c *EtcdEndpointsController) syncConfigMap(ctx context.Context, recorder ev

required.Data = endpointAddresses

safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("EtcdEndpointsController can't evaluate whether quorum is safe: %w", err)
}
if !safe {
return fmt.Errorf("skipping EtcdEndpointsController reconciliation due to insufficient quorum")
}

// Apply endpoint updates
if _, _, err := resourceapply.ApplyConfigMap(ctx, c.configmapClient, recorder, required); err != nil {
return fmt.Errorf("applying configmap update failed :%w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package etcdendpointscontroller

import (
"context"
"fmt"
"testing"

configv1 "github.com/openshift/api/config/v1"
Expand Down Expand Up @@ -269,29 +268,6 @@ func TestBootstrapAnnotationRemoval(t *testing.T) {
}
},
},
{
// The configmap should not be updated when quorum is critical
name: "ClusterNotUpdateWithMemberChangeViolatingQuorum",
objects: []runtime.Object{
u.BootstrapConfigMap(u.WithBootstrapStatus("complete")),
u.EndpointsConfigMap(
u.WithEndpoint(etcdMembers[0].ID, etcdMembers[0].PeerURLs[0]),
u.WithEndpoint(etcdMembers[1].ID, etcdMembers[1].PeerURLs[0]),
u.WithEndpoint(etcdMembers[2].ID, etcdMembers[2].PeerURLs[0]),
),
},
staticPodStatus: u.StaticPodOperatorStatus(
u.WithLatestRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
),
expectBootstrap: false,
etcdMembers: []*etcdserverpb.Member{
u.FakeEtcdMemberWithoutServer(0),
},
expectedErr: fmt.Errorf("EtcdEndpointsController can't evaluate whether quorum is safe: %w", fmt.Errorf("etcd cluster has quorum of 1 which is not fault tolerant: [{Member:name:\"etcd-0\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2907\" Healthy:true Took: Error:<nil>}]")),
},
{
// The configmap should be created without a bootstrap IP because the
// only time the configmap won't already exist is when we've upgraded
Expand Down
5 changes: 5 additions & 0 deletions pkg/operator/starter.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,10 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
return !isSNO, precheckSucceeded, err
}

quorumSafe := func(ctx context.Context) (bool, error) {
return quorumChecker.IsSafeToUpdateRevision()
}

staticPodControllers, err := staticpod.NewBuilder(operatorClient, kubeClient, kubeInformersForNamespaces, configInformers).
WithEvents(controllerContext.EventRecorder).
WithInstaller([]string{"cluster-etcd-operator", "installer"}).
Expand All @@ -322,6 +326,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
guardRolloutPreCheck,
).
WithOperandPodLabelSelector(labels.Set{"etcd": "true"}.AsSelector()).
WithInstallPrecondition(quorumSafe).
ToControllers()
if err != nil {
return err
Expand Down
18 changes: 0 additions & 18 deletions pkg/operator/targetconfigcontroller/targetconfigcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,6 @@ func NewTargetConfigController(
}

func (c TargetConfigController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("TargetConfigController can't evaluate whether quorum is safe: %w", err)
}

if !safe {
return fmt.Errorf("skipping TargetConfigController reconciliation due to insufficient quorum")
}

envVars := c.envVarGetter.GetEnvVars()
if len(envVars) == 0 {
return fmt.Errorf("TargetConfigController missing env var values")
Expand All @@ -105,15 +96,6 @@ func (c TargetConfigController) sync(ctx context.Context, syncCtx factory.SyncCo
if err != nil {
return err
}
// Check whether the cluster is healthy after getting the env vars, to ensure it is safe to roll out.
safe, err = c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("TargetConfigController can't evaluate whether quorum is safe: %w", err)
}

if !safe {
return fmt.Errorf("skipping TargetConfigController reconciliation due to insufficient quorum")
}
requeue, err := createTargetConfig(ctx, c, syncCtx.Recorder(), operatorSpec, envVars)
if err != nil {
return err
Expand Down
19 changes: 0 additions & 19 deletions pkg/operator/targetconfigcontroller/targetconfigcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package targetconfigcontroller

import (
"context"
"fmt"
"testing"

configv1 "github.com/openshift/api/config/v1"
Expand Down Expand Up @@ -57,24 +56,6 @@ func TestTargetConfigController(t *testing.T) {
},
etcdMembersEnvVar: "1,2,3",
},
{
name: "Quorum not fault tolerant",
objects: []runtime.Object{
u.BootstrapConfigMap(u.WithBootstrapStatus("complete")),
},
staticPodStatus: u.StaticPodOperatorStatus(
u.WithLatestRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
),
etcdMembers: []*etcdserverpb.Member{
u.FakeEtcdMemberWithoutServer(0),
u.FakeEtcdMemberWithoutServer(2),
},
etcdMembersEnvVar: "1,3",
expectedErr: fmt.Errorf("TargetConfigController can't evaluate whether quorum is safe: %w", fmt.Errorf("etcd cluster has quorum of 2 which is not fault tolerant: [{Member:name:\"etcd-0\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2907\" Healthy:true Took: Error:<nil>} {Member:ID:2 name:\"etcd-2\" peerURLs:\"https://10.0.0.3:2380\" clientURLs:\"https://10.0.0.3:2907\" Healthy:true Took: Error:<nil>}]")),
},
{
name: "Quorum not fault tolerant but bootstrapping",
objects: []runtime.Object{
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion vendor/modules.txt
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ github.com/openshift/client-go/operator/informers/externalversions/operator/v1
github.com/openshift/client-go/operator/informers/externalversions/operator/v1alpha1
github.com/openshift/client-go/operator/listers/operator/v1
github.com/openshift/client-go/operator/listers/operator/v1alpha1
# github.com/openshift/library-go v0.0.0-20240619140217-e20ca28ddfe7
# github.com/openshift/library-go v0.0.0-20240619140217-e20ca28ddfe7 => github.com/jubittajohn/library-go v0.0.0-20240723232143-3c6c63efe923
## explicit; go 1.22.0
github.com/openshift/library-go/pkg/assets
github.com/openshift/library-go/pkg/authorization/hardcodedauthorizer
Expand Down Expand Up @@ -1542,4 +1542,5 @@ sigs.k8s.io/structured-merge-diff/v4/value
## explicit; go 1.12
sigs.k8s.io/yaml
# github.com/gogo/protobuf => github.com/gogo/protobuf v1.3.2
# github.com/openshift/library-go => github.com/jubittajohn/library-go v0.0.0-20240723232143-3c6c63efe923
# vbom.ml/util => github.com/fvbommel/util v0.0.0-20180919145318-efcd4e0f9787