Skip to content

Commit

Permalink
Handle interrupted helm releases in applier
Browse files Browse the repository at this point in the history
  • Loading branch information
azych committed Feb 17, 2025
1 parent 7f00b13 commit c073dfc
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
7 changes: 4 additions & 3 deletions cmd/operator-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ func run() error {
crdupgradesafety.NewPreflight(aeClient.CustomResourceDefinitions()),
}

helmApplier := &applier.Helm{
ActionClientGetter: acg,
Preflights: preflights,
helmApplier, err := applier.NewHelm(acg, coreClient, preflights, systemNamespace)

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / unit-test-basic

undefined: systemNamespace

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / goreleaser

undefined: systemNamespace

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / e2e-kind

undefined: systemNamespace

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / upgrade-e2e

undefined: systemNamespace

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / extension-developer-e2e

undefined: systemNamespace

Check failure on line 410 in cmd/operator-controller/main.go

View workflow job for this annotation

GitHub Actions / upgrade-e2e

undefined: systemNamespace
if err != nil {
setupLog.Error(err, "unable to create helm applier")
os.Exit(1)
}

cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())
Expand Down
71 changes: 65 additions & 6 deletions internal/operator-controller/applier/helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ import (
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage/driver"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
apimachyaml "k8s.io/apimachinery/pkg/util/yaml"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

Expand All @@ -36,6 +38,8 @@ const (
StateUnchanged string = "Unchanged"
StateError string = "Error"
maxHelmReleaseHistory = 10

secretTypeIndexV1 = "type=operatorframework.io/index.v1"
)

// Preflight is a check that should be run before making any changes to the cluster
Expand All @@ -54,8 +58,26 @@ type Preflight interface {
}

type Helm struct {
ActionClientGetter helmclient.ActionClientGetter
Preflights []Preflight
actionClientGetter helmclient.ActionClientGetter
secretsClientGetter corev1client.SecretsGetter
preflights []Preflight
systemNamespace string
}

func NewHelm(acg helmclient.ActionClientGetter, scg corev1client.SecretsGetter, preflights []Preflight, systemNamespace string) (*Helm, error) {
if acg == nil {
return nil, fmt.Errorf("action client getter is nil")
}
if scg == nil {
return nil, fmt.Errorf("secrets client getter is nil")
}

return &Helm{
actionClientGetter: acg,
secretsClientGetter: scg,
preflights: preflights,
systemNamespace: systemNamespace,
}, nil
}

// shouldSkipPreflight is a helper to determine if the preflight check is CRDUpgradeSafety AND
Expand Down Expand Up @@ -85,7 +107,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
}
values := chartutil.Values{}

ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
ac, err := h.actionClientGetter.ActionClientFor(ctx, ext)
if err != nil {
return nil, "", err
}
Expand All @@ -94,12 +116,12 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
labels: objectLabels,
}

rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
if err != nil {
return nil, "", err
}

for _, preflight := range h.Preflights {
for _, preflight := range h.preflights {
if shouldSkipPreflight(ctx, preflight, ext, state) {
continue
}
Expand Down Expand Up @@ -152,9 +174,28 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
return relObjects, state, nil
}

func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
logger := log.FromContext(ctx)
currentRelease, err := cl.Get(ext.GetName())

// if a release is pending at this point, that means that a helm action
// (installation/upgrade) we were attempting was likely interrupted in-flight.
// Pending release would leave us in reconciliation error loop because helm
// wouldn't be able to progress automatically from it.
//
// one of the workarounds is to try and remove all helm secrets relating to
// that pending release which should 'reset' its state communicated to helm
// and the next reconciliation should be able to successfully pick up from here
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
if err == nil && currentRelease.Info.Status.IsPending() {
logger.V(4).Info("ClusterExtension release pending", "extension", ext.GetName(), "release", currentRelease.Name)
if err = h.deleteReleaseSecrets(ctx, currentRelease.Name); err != nil {
return nil, nil, StateError, fmt.Errorf("failed deleting secrets for pending release %q: %w", currentRelease.Name, err)
}
}

if errors.Is(err, driver.ErrReleaseNotFound) {
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
i.DryRun = true
i.DryRunOption = "server"
Expand All @@ -174,6 +215,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
}

desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
upgrade.MaxHistory = maxHelmReleaseHistory
upgrade.DryRun = true
upgrade.DryRunOption = "server"
Expand Down Expand Up @@ -220,3 +262,20 @@ func (p *postrenderer) Run(renderedManifests *bytes.Buffer) (*bytes.Buffer, erro
}
return &buf, nil
}

func (h *Helm) deleteReleaseSecrets(ctx context.Context, releaseName string) error {
return h.secretsClientGetter.Secrets(h.systemNamespace).DeleteCollection(
ctx,
metav1.DeleteOptions{},
metav1.ListOptions{
FieldSelector: secretTypeIndexV1,
LabelSelector: fmt.Sprintf(
"name in (%s),status in(%s, %s, %s)",
releaseName,
release.StatusPendingInstall,
release.StatusPendingUpgrade,
release.StatusPendingRollback,
),
},
)
}

0 comments on commit c073dfc

Please sign in to comment.