feat: support using exponential backoff between self heal attempts (argoproj#20275)

Signed-off-by: Alexander Matyushentsev <[email protected]>
alexmt authored Oct 8, 2024
1 parent 6002c7d commit dc27102
Showing 17 changed files with 1,033 additions and 723 deletions.
5 changes: 5 additions & 0 deletions assets/swagger.json

(Generated file; diff not rendered.)

@@ -13,6 +13,7 @@ import (
"github.com/redis/go-redis/v9"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"

@@ -57,6 +58,9 @@ func NewCommand() *cobra.Command {
repoServerAddress string
repoServerTimeoutSeconds int
selfHealTimeoutSeconds int
selfHealBackoffTimeoutSeconds int
selfHealBackoffFactor int
selfHealBackoffCapSeconds int
statusProcessors int
operationProcessors int
glogLevel int
@@ -156,6 +160,14 @@ func NewCommand() *cobra.Command {
kubectl := kubeutil.NewKubectl()
clusterSharding, err := sharding.GetClusterSharding(kubeClient, settingsMgr, shardingAlgorithm, enableDynamicClusterDistribution)
errors.CheckError(err)
var selfHealBackoff *wait.Backoff
if selfHealBackoffTimeoutSeconds != 0 {
selfHealBackoff = &wait.Backoff{
Duration: time.Duration(selfHealBackoffTimeoutSeconds) * time.Second,
Factor: float64(selfHealBackoffFactor),
Cap: time.Duration(selfHealBackoffCapSeconds) * time.Second,
}
}
appController, err = controller.NewApplicationController(
namespace,
settingsMgr,
@@ -168,6 +180,7 @@ func NewCommand() *cobra.Command {
hardResyncDuration,
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
metricsCacheExpiration,
@@ -231,7 +244,10 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&glogLevel, "gloglevel", 0, "Set the glog logging level")
command.Flags().IntVar(&metricsPort, "metrics-port", common.DefaultPortArgoCDMetrics, "Start metrics server on given port")
command.Flags().DurationVar(&metricsCacheExpiration, "metrics-cache-expiration", env.ParseDurationFromEnv("ARGOCD_APPLICATION_CONTROLLER_METRICS_CACHE_EXPIRATION", 0*time.Second, 0, math.MaxInt64), "Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 5, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 0, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
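Taken together, the three new flags above (and their ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_* environment variables) populate the optional wait.Backoff built earlier in NewCommand: with the defaults, self heal retries start at 2 seconds, grow by a factor of 3, and are capped at 300 seconds. Setting --self-heal-backoff-timeout-seconds to 0 leaves selfHealBackoff nil, so the controller falls back to the fixed --self-heal-timeout-seconds delay; the adjacent flag lines show that flag's default changing from 5 to 0 in this commit. A worked sketch of the resulting delays follows the controller/appcontroller.go diff below.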
27 changes: 24 additions & 3 deletions controller/appcontroller.go
@@ -130,6 +130,7 @@ type ApplicationController struct {
statusHardRefreshTimeout time.Duration
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackOff *wait.Backoff
repoClientset apiclient.Clientset
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
@@ -160,6 +161,7 @@ func NewApplicationController(
appHardResyncPeriod time.Duration,
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
repoErrorGracePeriod time.Duration,
metricsPort int,
metricsCacheExpiration time.Duration,
@@ -201,6 +203,7 @@ func NewApplicationController(
auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController, enableK8sEvent),
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackOff: selfHealBackoff,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
applicationNamespaces: applicationNamespaces,
@@ -1985,6 +1988,9 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
InitiatedBy: appv1.OperationInitiator{Automated: true},
Retry: appv1.RetryStrategy{Limit: 5},
}
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
}
if app.Spec.SyncPolicy.Retry != nil {
op.Retry = *app.Spec.SyncPolicy.Retry
}
@@ -2002,6 +2008,7 @@
return nil, 0
} else if alreadyAttempted && selfHeal {
if shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app); shouldSelfHeal {
op.Sync.SelfHealAttemptsCount++
for _, resource := range resources {
if resource.Status != appv1.SyncStatusCodeSynced {
op.Sync.Resources = append(op.Sync.Resources, appv1.SyncOperationResource{
@@ -2120,10 +2127,24 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool,
}

var retryAfter time.Duration
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
if ctrl.selfHealBackOff == nil {
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
}
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
backOff := *ctrl.selfHealBackOff
backOff.Steps = int(app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount)
var delay time.Duration
for backOff.Steps > 0 {
delay = backOff.Step()
}
if app.Status.OperationState.FinishedAt == nil {
retryAfter = delay
} else {
retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time)
}
}
return retryAfter <= 0, retryAfter
}
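For illustration, here is a minimal standalone sketch (not part of the commit) of the retry-delay logic above: autoSync carries SelfHealAttemptsCount over from the previous sync operation and increments it on every self heal attempt, and shouldSelfHeal replays wait.Backoff.Step() that many times to compute the wait before the next attempt. The delay values in the trailing comment assume the default flags (2s initial timeout, factor 3, 300s cap).

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// selfHealDelay mirrors the Step() loop in shouldSelfHeal.
func selfHealDelay(attempts int64) time.Duration {
	backOff := wait.Backoff{
		Duration: 2 * time.Second, // --self-heal-backoff-timeout-seconds
		Factor:   3,               // --self-heal-backoff-factor
		Cap:      5 * time.Minute, // --self-heal-backoff-cap-seconds
		Steps:    int(attempts),   // one step per previous self heal attempt
	}
	var delay time.Duration
	for backOff.Steps > 0 {
		// Step() returns the current Duration, then multiplies it by Factor
		// (bounded by Cap) and decrements Steps.
		delay = backOff.Step()
	}
	return delay
}

func main() {
	for n := int64(0); n <= 6; n++ {
		fmt.Printf("self heal attempt %d: wait %v\n", n, selfHealDelay(n))
	}
	// 0s, 2s, 6s, 18s, 54s, 2m42s, 5m0s
}

When FinishedAt is set, the time already elapsed since the previous operation is subtracted from this delay, so the heal fires as soon as the backoff window has passed.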
70 changes: 68 additions & 2 deletions controller/appcontroller_test.go
@@ -4,16 +4,18 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"testing"
"time"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"
"github.com/argoproj/gitops-engine/pkg/utils/kube/kubetest"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"
"k8s.io/utils/ptr"

"github.com/argoproj/argo-cd/v2/common"
statecache "github.com/argoproj/argo-cd/v2/controller/cache"
@@ -157,6 +159,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
time.Hour,
time.Second,
time.Minute,
nil,
time.Second*10,
common.DefaultPortArgoCDMetrics,
data.metricsCacheExpiration,
@@ -2191,3 +2194,66 @@ func TestAlreadyAttemptSync(t *testing.T) {
assert.False(t, attempted)
})
}

func assertDurationAround(t *testing.T, expected time.Duration, actual time.Duration) {
delta := time.Second / 2
assert.GreaterOrEqual(t, expected, actual-delta)
assert.LessOrEqual(t, expected, actual+delta)
}

func TestSelfHealExponentialBackoff(t *testing.T) {
ctrl := newFakeController(&fakeData{}, nil)
ctrl.selfHealBackOff = &wait.Backoff{
Factor: 3,
Duration: 2 * time.Second,
Cap: 5 * time.Minute,
}

app := &v1alpha1.Application{
Status: v1alpha1.ApplicationStatus{
OperationState: &v1alpha1.OperationState{
Operation: v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{},
},
},
},
}

testCases := []struct {
attempts int64
finishedAt *metav1.Time
expectedDuration time.Duration
shouldSelfHeal bool
}{{
attempts: 0,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 0,
shouldSelfHeal: true,
}, {
attempts: 1,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 2 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 2,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 6 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 3,
finishedAt: nil,
expectedDuration: 18 * time.Second,
shouldSelfHeal: false,
}}

for i := range testCases {
tc := testCases[i]
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = tc.attempts
app.Status.OperationState.FinishedAt = tc.finishedAt
ok, duration := ctrl.shouldSelfHeal(app)
require.Equal(t, ok, tc.shouldSelfHeal)
assertDurationAround(t, tc.expectedDuration, duration)
})
}
}
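For reference, the Step() loop gives the n-th self heal attempt a delay of min(Duration * Factor^(n-1), Cap) for n >= 1 and no delay for n = 0, so the test's backoff (2s initial, factor 3, 5m cap) yields the expected 2s, 6s and 18s for attempts 1 through 3; the attempts = 0 case heals immediately, and the attempts = 3 case expects the full 18s because FinishedAt is nil, leaving nothing to subtract.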
7 changes: 5 additions & 2 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
@@ -47,8 +47,11 @@ data:
controller.log.level: "info"
# Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)
controller.metrics.cache.expiration: "24h0m0s"
# Specifies timeout between application self heal attempts (default 5)
controller.self.heal.timeout.seconds: "5"
# Specifies exponential backoff timeout parameters between application self heal attempts
controller.self.heal.timeout.seconds: "2"
controller.self.heal.backoff.factor: "3"
controller.self.heal.backoff.cap.seconds: "300"

# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
# Specifies if resource health should be persisted in app CRD (default true)
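In the manifests below, each backoff setting reaches the controller through an optional environment variable: controller.self.heal.backoff.timeout.seconds is read as ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS (--self-heal-backoff-timeout-seconds), controller.self.heal.backoff.factor as ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR (--self-heal-backoff-factor), and controller.self.heal.backoff.cap.seconds as ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS (--self-heal-backoff-cap-seconds).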


@@ -97,6 +97,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
@@ -100,6 +100,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
28 changes: 28 additions & 0 deletions manifests/core-install.yaml

(Generated file; diff not rendered.)

10 changes: 10 additions & 0 deletions manifests/crds/application-crd.yaml

(Generated file; diff not rendered.)
