Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Handle interrupted helm releases in applier #1776

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions cmd/operator-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ func run() error {
crdupgradesafety.NewPreflight(aeClient.CustomResourceDefinitions()),
}

helmApplier := &applier.Helm{
ActionClientGetter: acg,
Preflights: preflights,
helmApplier, err := applier.NewHelm(acg, preflights, cfg.systemNamespace)
if err != nil {
setupLog.Error(err, "unable to create helm applier")
return err
}

cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ require (
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.0
github.com/operator-framework/api v0.29.0
github.com/operator-framework/helm-operator-plugins v0.8.0
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
github.com/operator-framework/operator-registry v1.50.0
github.com/prometheus/client_golang v1.21.0
github.com/sirupsen/logrus v1.9.3
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
github.com/operator-framework/api v0.29.0 h1:TxAR8RCO+I4FjRrY4PSMgnlmbxNWeD8pzHXp7xwHNmw=
github.com/operator-framework/api v0.29.0/go.mod h1:0whQE4mpMDd2zyHkQe+bFa3DLoRs6oGWCbu8dY/3pyc=
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=
Expand Down
6 changes: 6 additions & 0 deletions internal/operator-controller/action/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
return args.Error(0)
}

func (m *mockActionClient) Config() *action.Configuration {
args := m.Called()
return args.Get(0).(*action.Configuration)
}

var _ actionclient.ActionClientGetter = &mockActionClientGetter{}

type mockActionClientGetter struct {
Expand Down
47 changes: 41 additions & 6 deletions internal/operator-controller/applier/helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,21 @@ type Preflight interface {
}

type Helm struct {
ActionClientGetter helmclient.ActionClientGetter
Preflights []Preflight
actionClientGetter helmclient.ActionClientGetter
preflights []Preflight
systemNamespace string
}

func NewHelm(acg helmclient.ActionClientGetter, preflights []Preflight, systemNamespace string) (*Helm, error) {
if acg == nil {
return nil, fmt.Errorf("action client getter is nil")
}

return &Helm{
actionClientGetter: acg,
preflights: preflights,
systemNamespace: systemNamespace,
}, nil
}

// shouldSkipPreflight is a helper to determine if the preflight check is CRDUpgradeSafety AND
Expand Down Expand Up @@ -85,7 +98,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
}
values := chartutil.Values{}

ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
ac, err := h.actionClientGetter.ActionClientFor(ctx, ext)
if err != nil {
return nil, "", err
}
Expand All @@ -94,12 +107,12 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
labels: objectLabels,
}

rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
if err != nil {
return nil, "", err
}

for _, preflight := range h.Preflights {
for _, preflight := range h.preflights {
if shouldSkipPreflight(ctx, preflight, ext, state) {
continue
}
Expand Down Expand Up @@ -152,9 +165,30 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
return relObjects, state, nil
}

func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
logger := log.FromContext(ctx)
currentRelease, err := cl.Get(ext.GetName())

// if a release is pending at this point, that means that a helm action
// (installation/upgrade) we were attempting was likely interrupted in-flight.
// Pending release would leave us in reconciliation error loop because helm
// wouldn't be able to progress automatically from it.
//
// one of the workarounds is to try and remove helm metadata relating to
// that pending release which should 'reset' its state communicated to helm
// and the next reconciliation should be able to successfully pick up from here
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
if err == nil && currentRelease.Info.Status.IsPending() {
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
}
// return error to try to detect proper state (installation/upgrade) at next reconciliation
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
}

if errors.Is(err, driver.ErrReleaseNotFound) {
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
i.DryRun = true
i.DryRunOption = "server"
Expand All @@ -174,6 +208,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
}

desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
upgrade.MaxHistory = maxHelmReleaseHistory
upgrade.DryRun = true
upgrade.DryRunOption = "server"
Expand Down
120 changes: 105 additions & 15 deletions internal/operator-controller/applier/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage"
"helm.sh/helm/v3/pkg/storage/driver"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -46,6 +47,7 @@ type mockActionGetter struct {
reconcileErr error
desiredRel *release.Release
currentRel *release.Release
config *action.Configuration
}

func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
Expand Down Expand Up @@ -94,6 +96,12 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
return mag.reconcileErr
}

func (mag *mockActionGetter) Config() *action.Configuration {
// TODO
// storage.Init(driver.NewMemory()).
return mag.config
}

var (
// required for unmockable call to convert.RegistryV1ToHelmChart
validFS = fstest.MapFS{
Expand Down Expand Up @@ -144,7 +152,8 @@ func TestApply_Base(t *testing.T) {

t.Run("fails trying to obtain an action client", func(t *testing.T) {
mockAcg := &mockActionGetter{actionClientForErr: errors.New("failed getting action client")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -155,7 +164,8 @@ func TestApply_Base(t *testing.T) {

t.Run("fails getting current release and !driver.ErrReleaseNotFound", func(t *testing.T) {
mockAcg := &mockActionGetter{getClientErr: errors.New("failed getting current release")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -165,13 +175,80 @@ func TestApply_Base(t *testing.T) {
})
}

func TestApply_InterruptedRelease(t *testing.T) {
t.Run("fails removing an interrupted install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes an interrupted install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removed interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removed interrupted release")
require.Nil(t, objs)
require.Empty(t, state)
})
}

func TestApply_Installation(t *testing.T) {
t.Run("fails during dry-run installation", func(t *testing.T) {
mockAcg := &mockActionGetter{
getClientErr: driver.ErrReleaseNotFound,
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -186,7 +263,8 @@ func TestApply_Installation(t *testing.T) {
installErr: errors.New("failed installing chart"),
}
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -200,7 +278,8 @@ func TestApply_Installation(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
installErr: errors.New("failed installing chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -217,7 +296,8 @@ func TestApply_Installation(t *testing.T) {
Manifest: validManifest,
},
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand All @@ -236,7 +316,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -251,7 +332,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
installErr: errors.New("failed installing chart"),
}
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -265,7 +347,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
getClientErr: driver.ErrReleaseNotFound,
installErr: errors.New("failed installing chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -282,7 +365,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
Manifest: validManifest,
},
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand All @@ -300,9 +384,11 @@ func TestApply_Upgrade(t *testing.T) {

t.Run("fails during dry-run upgrade", func(t *testing.T) {
mockAcg := &mockActionGetter{
currentRel: testCurrentRelease,
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -321,7 +407,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{upgradeErr: errors.New("failed during upgrade pre-flight check")}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -340,7 +427,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -359,7 +447,8 @@ func TestApply_Upgrade(t *testing.T) {
desiredRel: &testDesiredRelease,
}
mockPf := &mockPreflight{}
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
Expand All @@ -376,7 +465,8 @@ func TestApply_Upgrade(t *testing.T) {
currentRel: testCurrentRelease,
desiredRel: &testDesiredRelease,
}
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
require.NoError(t, err)

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.NoError(t, err)
Expand Down
Loading
Loading