Skip to content

Commit

Permalink
Handle interrupted helm releases in applier
Browse files Browse the repository at this point in the history
Introduces a workaround for 'interrupted' helm releases
which enter into a 'pending' (-install/uninstall/rollback) state.
If that happens, for example because of immediate application
exit with one of those operations being in flight, helm is not
able to resolve it automatically which means we end up in
a permanent reconcile error state.
One of the workarounds for this that has been repeated in the
community is to remove metadata of the pending release,
which is the solution chosen here.

For full context see:
#1776
helm/helm#5595
  • Loading branch information
azych committed Feb 24, 2025
1 parent 9b888eb commit 7a31a06
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 27 deletions.
7 changes: 4 additions & 3 deletions cmd/operator-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ func run() error {
crdupgradesafety.NewPreflight(aeClient.CustomResourceDefinitions()),
}

helmApplier := &applier.Helm{
ActionClientGetter: acg,
Preflights: preflights,
helmApplier, err := applier.NewHelm(acg, preflights, cfg.systemNamespace)
if err != nil {
setupLog.Error(err, "unable to create helm applier")
return err
}

cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ require (
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.0
github.com/operator-framework/api v0.29.0
github.com/operator-framework/helm-operator-plugins v0.8.0
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
github.com/operator-framework/operator-registry v1.50.0
github.com/prometheus/client_golang v1.21.0
github.com/sirupsen/logrus v1.9.3
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
github.com/operator-framework/api v0.29.0 h1:TxAR8RCO+I4FjRrY4PSMgnlmbxNWeD8pzHXp7xwHNmw=
github.com/operator-framework/api v0.29.0/go.mod h1:0whQE4mpMDd2zyHkQe+bFa3DLoRs6oGWCbu8dY/3pyc=
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=
Expand Down
6 changes: 6 additions & 0 deletions internal/operator-controller/action/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
return args.Error(0)
}

func (m *mockActionClient) Config() *action.Configuration {
args := m.Called()
return args.Get(0).(*action.Configuration)
}

var _ actionclient.ActionClientGetter = &mockActionClientGetter{}

type mockActionClientGetter struct {
Expand Down
47 changes: 41 additions & 6 deletions internal/operator-controller/applier/helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,21 @@ type Preflight interface {
}

type Helm struct {
ActionClientGetter helmclient.ActionClientGetter
Preflights []Preflight
actionClientGetter helmclient.ActionClientGetter
preflights []Preflight
systemNamespace string
}

func NewHelm(acg helmclient.ActionClientGetter, preflights []Preflight, systemNamespace string) (*Helm, error) {
if acg == nil {
return nil, fmt.Errorf("action client getter is nil")
}

return &Helm{
actionClientGetter: acg,
preflights: preflights,
systemNamespace: systemNamespace,
}, nil
}

// shouldSkipPreflight is a helper to determine if the preflight check is CRDUpgradeSafety AND
Expand Down Expand Up @@ -85,7 +98,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
}
values := chartutil.Values{}

ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
ac, err := h.actionClientGetter.ActionClientFor(ctx, ext)
if err != nil {
return nil, "", err
}
Expand All @@ -94,12 +107,12 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
labels: objectLabels,
}

rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
if err != nil {
return nil, "", err
}

for _, preflight := range h.Preflights {
for _, preflight := range h.preflights {
if shouldSkipPreflight(ctx, preflight, ext, state) {
continue
}
Expand Down Expand Up @@ -152,9 +165,30 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
return relObjects, state, nil
}

func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
logger := log.FromContext(ctx)
currentRelease, err := cl.Get(ext.GetName())

// if a release is pending at this point, that means that a helm action
// (installation/upgrade) we were attempting was likely interrupted in-flight.
// Pending release would leave us in reconciliation error loop because helm
// wouldn't be able to progress automatically from it.
//
// one of the workarounds is to try and remove helm metadata relating to
// that pending release which should 'reset' its state communicated to helm
// and the next reconciliation should be able to successfully pick up from here
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
if err == nil && currentRelease.Info.Status.IsPending() {
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
}
// return error to try to detect proper state (installation/upgrade) at next reconciliation
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
}

if errors.Is(err, driver.ErrReleaseNotFound) {
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
i.DryRun = true
i.DryRunOption = "server"
Expand All @@ -174,6 +208,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
}

desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
upgrade.MaxHistory = maxHelmReleaseHistory
upgrade.DryRun = true
upgrade.DryRunOption = "server"
Expand Down
119 changes: 104 additions & 15 deletions internal/operator-controller/applier/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage"
"helm.sh/helm/v3/pkg/storage/driver"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -46,6 +47,7 @@ type mockActionGetter struct {
reconcileErr error
desiredRel *release.Release
currentRel *release.Release
config *action.Configuration
}

func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
Expand Down Expand Up @@ -94,6 +96,12 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
return mag.reconcileErr
}

func (mag *mockActionGetter) Config() *action.Configuration {
// TODO
// storage.Init(driver.NewMemory()).
return mag.config
}

var (
// required for unmockable call to convert.RegistryV1ToHelmChart
validFS = fstest.MapFS{
Expand Down Expand Up @@ -144,7 +152,8 @@ func TestApply_Base(t *testing.T) {

t.Run("fails trying to obtain an action client", func(t *testing.T) {
mockAcg := &mockActionGetter{actionClientForErr: errors.New("failed getting action client")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -155,7 +164,8 @@ func TestApply_Base(t *testing.T) {

t.Run("fails getting current release and !driver.ErrReleaseNotFound", func(t *testing.T) {
mockAcg := &mockActionGetter{getClientErr: errors.New("failed getting current release")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -165,13 +175,80 @@ func TestApply_Base(t *testing.T) {
})
}

func TestApply_InterruptedRelease(t *testing.T) {
t.Run("fails removing an interrupted install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removed interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes an interrupted install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})
}

func TestApply_Installation(t *testing.T) {
t.Run("fails during dry-run installation", func(t *testing.T) {
mockAcg := &mockActionGetter{
getClientErr: driver.ErrReleaseNotFound,
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -186,7 +263,8 @@ func TestApply_Installation(t *testing.T) {
installErr: errors.New("failed installing chart"),
}
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -200,7 +278,8 @@ func TestApply_Installation(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
installErr: errors.New("failed installing chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -217,7 +296,8 @@ func TestApply_Installation(t *testing.T) {
Manifest: validManifest,
},
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand All @@ -236,7 +316,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -251,7 +332,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
installErr: errors.New("failed installing chart"),
}
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -265,7 +347,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
installErr: errors.New("failed installing chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -282,7 +365,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
Manifest: validManifest,
},
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand All @@ -302,7 +386,8 @@ func TestApply_Upgrade(t *testing.T) {
mockAcg := &mockActionGetter{
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -321,7 +406,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{upgradeErr: errors.New("failed during upgrade pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -340,7 +426,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -359,7 +446,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -376,7 +464,8 @@ func TestApply_Upgrade(t *testing.T) {
currentRel: testCurrentRelease,
desiredRel: &testDesiredRelease,
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand Down
Loading

0 comments on commit 7a31a06

Please sign in to comment.