[release-v0.6.x] fix: properly garbage collecting orphaned network interfaces (#642) #668

Open · wants to merge 3 commits into base: release-v0.6.x
1 change: 0 additions & 1 deletion pkg/cloudprovider/cloudprovider.go
@@ -324,7 +324,6 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.

labels[corev1beta1.CapacityTypeLabelKey] = instance.GetCapacityType(vm)

// TODO: v1beta1 new kes/labels
if tag, ok := vm.Tags[instance.NodePoolTagKey]; ok {
labels[corev1beta1.NodePoolLabelKey] = *tag
}
2 changes: 1 addition & 1 deletion pkg/cloudprovider/suite_test.go
@@ -135,7 +135,7 @@ var _ = Describe("CloudProvider", func() {
nodeClaims, _ := cloudProvider.List(ctx)
Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1))
queryRequest := azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Pop().Query
Expect(*queryRequest.Query).To(Equal(instance.GetListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(*queryRequest.Query).To(Equal(instance.GetVMListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(nodeClaims).To(HaveLen(1))
Expect(nodeClaims[0]).ToNot(BeNil())
resp, _ := azureEnv.VirtualMachinesAPI.Get(ctx, azureEnv.AzureResourceGraphAPI.ResourceGroup, nodeClaims[0].Name, nil)
6 changes: 5 additions & 1 deletion pkg/controllers/controllers.go
@@ -33,7 +33,11 @@ import (
func NewControllers(ctx context.Context, kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider, instanceProvider *instance.Provider) []controller.Controller {
logging.FromContext(ctx).With("version", project.Version).Debugf("discovered version")
controllers := []controller.Controller{
nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider),

nodeclaimgarbagecollection.NewVirtualMachine(kubeClient, cloudProvider),
nodeclaimgarbagecollection.NewNetworkInterface(kubeClient, instanceProvider),

// TODO: nodeclaim tagging
inplaceupdate.NewController(kubeClient, instanceProvider),
}
return controllers
@@ -31,32 +31,32 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/karpenter/pkg/operator/controller"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/operator/controller"
)

type Controller struct {
type VirtualMachine struct {
kubeClient client.Client
cloudProvider *cloudprovider.CloudProvider
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeuing near the start of the controller
}

func NewController(kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider) *Controller {
return &Controller{
func NewVirtualMachine(kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider) *VirtualMachine {
return &VirtualMachine{
kubeClient: kubeClient,
cloudProvider: cloudProvider,
successfulCount: 0,
}
}

func (c *Controller) Name() string {
return "nodeclaim.garbagecollection"
func (c *VirtualMachine) Name() string {
return "instance.garbagecollection"
}

func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
func (c *VirtualMachine) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
// We LIST VMs on the CloudProvider BEFORE we grab NodeClaims/Nodes on the cluster so that we make sure that, if
// LISTing instances takes a long time, our information is more updated by the time we get to nodeclaim and Node LIST
// This works since our CloudProvider instances are deleted based on whether the NodeClaim exists or not, not vice-versa
@@ -90,7 +90,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, multierr.Combine(errs...)
}

func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodeList *v1.NodeList) error {
func (c *VirtualMachine) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodeList *v1.NodeList) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", nodeClaim.Status.ProviderID))
if err := c.cloudProvider.Delete(ctx, nodeClaim); err != nil {
return corecloudprovider.IgnoreNodeClaimNotFoundError(err)
@@ -109,6 +109,6 @@ func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.
return nil
}

func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Builder {
func (c *VirtualMachine) Builder(_ context.Context, m manager.Manager) controller.Builder {
return controller.NewSingletonManagedBy(m)
}
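
The `lo.Ternary` on the requeue line above gives the renamed VirtualMachine garbage collector a shorter interval while it is still warming up, then backs off. A minimal standalone sketch of that back-off rule; the function name and the sample counts are illustrative, not part of the PR:

```go
package main

import (
	"fmt"
	"time"
)

// requeueAfter mirrors the ternary in the VirtualMachine controller above:
// poll every 10 seconds for roughly the first 20 successful reconciles,
// then back off to every 2 minutes.
func requeueAfter(successfulCount uint64) time.Duration {
	if successfulCount <= 20 {
		return 10 * time.Second
	}
	return 2 * time.Minute
}

func main() {
	for _, n := range []uint64{1, 20, 21, 100} {
		fmt.Printf("after %d successful reconciles: requeue in %s\n", n, requeueAfter(n))
	}
}
```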
110 changes: 110 additions & 0 deletions pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go
@@ -0,0 +1,110 @@
/*
Portions Copyright (c) Microsoft Corporation.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package garbagecollection

import (
"context"
"fmt"
"time"

"github.com/samber/lo"
"knative.dev/pkg/logging"

"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
"sigs.k8s.io/karpenter/pkg/operator/controller"

"github.com/Azure/karpenter-provider-azure/pkg/providers/instance"
)

const (
NicReservationDuration = time.Second * 180
// We set this interval at 5 minutes, as that's how often our NRP limits are reset.
// See: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling#network-throttling
NicGarbageCollectionInterval = time.Minute * 5
)

type NetworkInterface struct {
kubeClient client.Client
instanceProvider *instance.Provider
}

func NewNetworkInterface(kubeClient client.Client, instanceProvider *instance.Provider) *NetworkInterface {
return &NetworkInterface{
kubeClient: kubeClient,
instanceProvider: instanceProvider,
}
}

func (c *NetworkInterface) Name() string {
return "networkinterface.garbagecollection"
}

func (c *NetworkInterface) populateUnremovableInterfaces(ctx context.Context) (sets.Set[string], error) {
unremovableInterfaces := sets.New[string]()
vms, err := c.instanceProvider.List(ctx)
if err != nil {
return unremovableInterfaces, fmt.Errorf("listing VMs: %w", err)
}
for _, vm := range vms {
unremovableInterfaces.Insert(lo.FromPtr(vm.Name))
}
nodeClaimList := &corev1beta1.NodeClaimList{}
if err := c.kubeClient.List(ctx, nodeClaimList); err != nil {
return unremovableInterfaces, fmt.Errorf("listing NodeClaims for NIC GC: %w", err)
}

for _, nodeClaim := range nodeClaimList.Items {
unremovableInterfaces.Insert(instance.GenerateResourceName(nodeClaim.Name))
}
return unremovableInterfaces, nil
}

func (c *NetworkInterface) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
nics, err := c.instanceProvider.ListNics(ctx)
if err != nil {
return reconcile.Result{}, fmt.Errorf("listing NICs: %w", err)
}

unremovableInterfaces, err := c.populateUnremovableInterfaces(ctx)
if err != nil {
return reconcile.Result{}, fmt.Errorf("error listing resources needed to populate unremovable nics %w", err)
}
workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int) {
nicName := lo.FromPtr(nics[i].Name)
if !unremovableInterfaces.Has(nicName) {
err := c.instanceProvider.DeleteNic(ctx, nicName)
if err != nil {
logging.FromContext(ctx).Error(err)
return
}

logging.FromContext(ctx).With("nic", nicName).Infof("garbage collected NIC")
}
})
return reconcile.Result{
RequeueAfter: NicGarbageCollectionInterval,
}, nil
}

func (c *NetworkInterface) Builder(_ context.Context, m manager.Manager) controller.Builder {
return controller.NewSingletonManagedBy(m)
}
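
The new controller's core rule is that a NIC is deleted only when its name matches neither a live VM nor a NIC name derived from an existing NodeClaim (populateUnremovableInterfaces inserts VM names directly, so NIC and VM names are assumed to coincide). Below is a minimal, self-contained sketch of that decision rule; `generateResourceName` and all example names are hypothetical stand-ins, not the provider's actual naming scheme:

```go
package main

import "fmt"

// generateResourceName is a hypothetical stand-in for instance.GenerateResourceName;
// the real naming scheme lives in the provider package.
func generateResourceName(nodeClaimName string) string {
	return "aks-" + nodeClaimName
}

// removableNICs returns the NIC names that are safe to garbage collect: those that
// match neither a live VM name nor a NIC name derived from an existing NodeClaim,
// mirroring populateUnremovableInterfaces above.
func removableNICs(nicNames, vmNames, nodeClaimNames []string) []string {
	unremovable := map[string]struct{}{}
	for _, vm := range vmNames {
		unremovable[vm] = struct{}{}
	}
	for _, nc := range nodeClaimNames {
		unremovable[generateResourceName(nc)] = struct{}{}
	}
	var orphaned []string
	for _, nic := range nicNames {
		if _, ok := unremovable[nic]; !ok {
			orphaned = append(orphaned, nic)
		}
	}
	return orphaned
}

func main() {
	nics := []string{"aks-claim-a", "aks-orphaned-1", "aks-claim-b"}
	vms := []string{"aks-claim-b"} // NIC names are assumed to match VM names here
	claims := []string{"claim-a"}  // claim-a still has a NodeClaim, so its NIC is kept
	fmt.Println(removableNICs(nics, vms, claims)) // [aks-orphaned-1]
}
```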