Skip to content

Commit

Permalink
cp(v0.33.x): feat: invalidate SSM cache upon AMI deprecation (#7301) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jmdeal authored Nov 7, 2024
1 parent 80c647e commit cfa6aa5
Show file tree
Hide file tree
Showing 14 changed files with 444 additions and 67 deletions.
1 change: 1 addition & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ func main() {
op.GetClient(),
op.EventRecorder,
op.UnavailableOfferingsCache,
op.SSMCache,
awsCloudProvider,
op.SubnetProvider,
op.SecurityGroupProvider,
Expand Down
36 changes: 30 additions & 6 deletions pkg/apis/v1/ec2nodeclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -487,16 +487,40 @@ func (in *EC2NodeClass) AMIFamily() string {
if in.Spec.AMIFamily != nil {
return *in.Spec.AMIFamily
}
if term, ok := lo.Find(in.Spec.AMISelectorTerms, func(t AMISelectorTerm) bool {
return t.Alias != ""
}); ok {
return AMIFamilyFromAlias(term.Alias)
if alias := in.Alias(); alias != nil {
return alias.Family
}
// Unreachable: validation enforces that one of the above conditions must be met
return AMIFamilyCustom
}

func AMIFamilyFromAlias(alias string) string {
// Alias is the parsed form of an AMI selector alias term, which is written
// as "<family>@<version>".
type Alias struct {
	// Family is the AMI family component of the alias (the part before "@").
	Family string
	// Version is the version component of the alias (the part after "@").
	Version string
}

// AliasVersionLatest is the alias version string "latest".
const AliasVersionLatest = "latest"

// String renders the alias back into its "<family>@<version>" form.
func (a *Alias) String() string {
	// All operands are strings, so fmt.Sprint concatenates them without
	// inserting any separators.
	return fmt.Sprint(a.Family, "@", a.Version)
}

// Alias returns the parsed alias of the first AMI selector term that has a
// non-empty Alias field, or nil when no term specifies an alias.
func (in *EC2NodeClass) Alias() *Alias {
	hasAlias := func(t AMISelectorTerm) bool { return t.Alias != "" }
	if match, found := lo.Find(in.Spec.AMISelectorTerms, hasAlias); found {
		return &Alias{
			Family:  amiFamilyFromAlias(match.Alias),
			Version: amiVersionFromAlias(match.Alias),
		}
	}
	return nil
}

func amiFamilyFromAlias(alias string) string {
components := strings.Split(alias, "@")
if len(components) != 2 {
log.Fatalf("failed to parse AMI alias %q, invalid format", alias)
Expand All @@ -516,7 +540,7 @@ func AMIFamilyFromAlias(alias string) string {
return family
}

func AMIVersionFromAlias(alias string) string {
func amiVersionFromAlias(alias string) string {
components := strings.Split(alias, "@")
if len(components) != 2 {
log.Fatalf("failed to parse AMI alias %q, invalid format", alias)
Expand Down
13 changes: 4 additions & 9 deletions pkg/apis/v1/ec2nodeclass_conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,10 @@ func (in *EC2NodeClass) ConvertTo(ctx context.Context, to apis.Convertible) erro
v1beta1enc.Spec.AMIFamily = lo.ToPtr(in.AMIFamily())
}

if term, ok := lo.Find(in.Spec.AMISelectorTerms, func(term AMISelectorTerm) bool {
return term.Alias != ""
}); ok {
version := AMIVersionFromAlias(term.Alias)
if version != "latest" {
v1beta1enc.Annotations = lo.Assign(v1beta1enc.Annotations, map[string]string{
AnnotationAliasVersionCompatibilityKey: version,
})
}
if alias := in.Alias(); alias != nil && alias.Version != AliasVersionLatest {
v1beta1enc.Annotations = lo.Assign(v1beta1enc.Annotations, map[string]string{
AnnotationAliasVersionCompatibilityKey: alias.Version,
})
}

in.Spec.convertTo(&v1beta1enc.Spec)
Expand Down
15 changes: 15 additions & 0 deletions pkg/apis/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pkg/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ const (
InstanceTypesAndZonesTTL = 5 * time.Minute
// InstanceProfileTTL is the time before we refresh checking instance profile existence at IAM
InstanceProfileTTL = 15 * time.Minute
// SSMProviderTTL is the time to drop SSM Provider data. This only queries EKS Optimized AMI
// SSMCacheTTL is the time to drop SSM Parameters by path data. This only queries EKS Optimized AMI
// releases, so we should expect this to be updated relatively infrequently.
SSMProviderTTL = 24 * time.Hour
SSMCacheTTL = 24 * time.Hour
)

const (
Expand Down
32 changes: 23 additions & 9 deletions pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,24 @@ package controllers
import (
"context"

"github.com/aws/aws-sdk-go/aws/session"
servicesqs "github.com/aws/aws-sdk-go/service/sqs"
cache "github.com/patrickmn/go-cache"
"github.com/samber/lo"
"k8s.io/utils/clock"
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"

"sigs.k8s.io/karpenter/pkg/events"
"sigs.k8s.io/karpenter/pkg/operator/controller"

"github.com/aws/karpenter/pkg/cache"
"github.com/aws/aws-sdk-go/aws/session"
servicesqs "github.com/aws/aws-sdk-go/service/sqs"

awscache "github.com/aws/karpenter/pkg/cache"
"github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/controllers/interruption"
nodeclaimgarbagecollection "github.com/aws/karpenter/pkg/controllers/nodeclaim/garbagecollection"
nodeclaimtagging "github.com/aws/karpenter/pkg/controllers/nodeclaim/tagging"
"github.com/aws/karpenter/pkg/controllers/nodeclass"
ssminvalidation "github.com/aws/karpenter/pkg/controllers/providers/ssm/invalidation"
"github.com/aws/karpenter/pkg/operator/options"
"github.com/aws/karpenter/pkg/providers/amifamily"
"github.com/aws/karpenter/pkg/providers/instance"
Expand All @@ -43,15 +45,27 @@ import (
"github.com/aws/karpenter/pkg/providers/subnet"
)

func NewControllers(ctx context.Context, sess *session.Session, clk clock.Clock, kubeClient client.Client, recorder events.Recorder,
unavailableOfferings *cache.UnavailableOfferings, cloudProvider *cloudprovider.CloudProvider, subnetProvider *subnet.Provider,
securityGroupProvider *securitygroup.Provider, instanceProfileProvider *instanceprofile.Provider, instanceProvider *instance.Provider,
pricingProvider *pricing.Provider, amiProvider *amifamily.Provider) []controller.Controller {

func NewControllers(
ctx context.Context,
sess *session.Session,
clk clock.Clock,
kubeClient client.Client,
recorder events.Recorder,
unavailableOfferings *awscache.UnavailableOfferings,
ssmCache *cache.Cache,
cloudProvider *cloudprovider.CloudProvider,
subnetProvider *subnet.Provider,
securityGroupProvider *securitygroup.Provider,
instanceProfileProvider *instanceprofile.Provider,
instanceProvider *instance.Provider,
pricingProvider *pricing.Provider,
amiProvider *amifamily.Provider,
) []controller.Controller {
controllers := []controller.Controller{
nodeclass.NewNodeClassController(kubeClient, recorder, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider),
nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider),
nodeclaimtagging.NewController(kubeClient, instanceProvider),
ssminvalidation.NewController(ssmCache, amiProvider),
}
if options.FromContext(ctx).InterruptionQueue != "" {
controllers = append(controllers, interruption.NewController(kubeClient, clk, recorder, lo.Must(sqs.NewProvider(ctx, servicesqs.New(sess), options.FromContext(ctx).InterruptionQueue)), unavailableOfferings))
Expand Down
91 changes: 91 additions & 0 deletions pkg/controllers/providers/ssm/invalidation/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package invalidation

import (
"context"
"time"

"github.com/patrickmn/go-cache"
"github.com/samber/lo"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/karpenter/pkg/operator/controller"
"sigs.k8s.io/karpenter/pkg/operator/injection"

"github.com/aws/karpenter/pkg/apis/v1beta1"
"github.com/aws/karpenter/pkg/providers/amifamily"
"github.com/aws/karpenter/pkg/providers/ssm"
)

// Controller is the SSM Invalidation controller. It is responsible for invalidating "latest" SSM parameters when they
// point to deprecated AMIs. This can occur when an EKS-optimized AMI with a regression is released, and the AMI team
// chooses to deprecate the AMI. Normally, SSM parameter cache entries expire after 24 hours to prevent a thundering
// herd upon a new AMI release; however, Karpenter should react faster when an AMI is deprecated. This controller
// ensures Karpenter reacts to AMI deprecations within its polling period (30m).
type Controller struct {
	// cache is the SSM parameter cache whose entries Reconcile inspects and evicts.
	cache *cache.Cache
	// amiProvider resolves the cached AMI IDs so their deprecation status can be checked.
	amiProvider *amifamily.Provider
}

// NewController builds an SSM invalidation Controller that operates on the
// given SSM cache and resolves AMIs through the given AMI provider.
func NewController(ssmCache *cache.Cache, amiProvider *amifamily.Provider) *Controller {
	return &Controller{cache: ssmCache, amiProvider: amiProvider}
}

// Name returns the controller's name, which Reconcile injects into the
// context.
func (c *Controller) Name() string {
	const controllerName = "providers.ssm.invalidation"
	return controllerName
}

// Reconcile evicts mutable SSM parameter cache entries whose cached value is
// an AMI ID that now resolves to a deprecated AMI.
//
// It works in three phases: index the mutable cache entries by the AMI ID
// they hold, resolve each of those AMI IDs through the AMI provider, and then
// delete the cache entry of every parameter whose AMI is marked deprecated.
// On success the controller asks to be requeued after 30 minutes; if any AMI
// lookup fails the error is returned and nothing is evicted.
func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
	ctx = injection.WithControllerName(ctx, c.Name())

	// Index mutable parameters by the AMI ID currently cached for them.
	// Immutable parameters are never invalidated here.
	paramsByAMI := map[string]ssm.Parameter{}
	for _, cached := range c.cache.Items() {
		if entry := cached.Object.(ssm.CacheEntry); entry.Parameter.IsMutable {
			paramsByAMI[entry.Value] = entry.Parameter
		}
	}

	// Resolve every AMI before evicting anything, so that a failed lookup
	// leaves the cache untouched.
	var resolved []amifamily.AMI
	for _, amiID := range lo.Keys(paramsByAMI) {
		// A throwaway node class selecting exactly this AMI ID lets us reuse
		// the AMI provider's lookup.
		nodeClass := &v1beta1.EC2NodeClass{
			Spec: v1beta1.EC2NodeClassSpec{
				AMISelectorTerms: []v1beta1.AMISelectorTerm{{ID: amiID}},
			},
		}
		found, err := c.amiProvider.Get(ctx, nodeClass, nil)
		if err != nil {
			return reconcile.Result{}, err
		}
		resolved = append(resolved, found...)
	}

	// Drop the cache entry behind each deprecated AMI.
	for _, ami := range resolved {
		if ami.Deprecated {
			param := paramsByAMI[ami.AmiID]
			c.cache.Delete(param.CacheKey())
		}
	}
	return reconcile.Result{RequeueAfter: 30 * time.Minute}, nil
}

// Builder registers the controller with the manager as a singleton
// controller and returns the resulting builder.
func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Builder {
	singleton := controller.NewSingletonManagedBy(m)
	return singleton
}
Loading

0 comments on commit cfa6aa5

Please sign in to comment.