Skip to content

Commit

Permalink
fix: properly garbage collecting orphaned network interfaces (#642)
Browse files Browse the repository at this point in the history
* feat: adding ListNics to instanceprovider interface alongside a refactor of arg related functions to their own file

* feat: adding garbage collection logic for network interfaces and refactoring gc functions slightly

* feat: working poc of ARG Queries and nic garbage collection, need to fix tests

* fix: tests failing due to ListVM

* feat: split nic and vm into their own gc controllers, added shared state between them to prevent conflicts in nic deletion calls

* feat: Add DeleteNic option to instance provider

* docs(code-comment): adding clarification to unremovableNics

* test: instanceprovider.ListNics barebones test and arg fake list nic impl

* fix: bootstrap.sh

* feat: adding in VM List into the networkinterface.garbagecollection controller to avoid attempts to delete nics that are managed by a vm

* refactor: unremovableNics out of the vm gc controller in favor of a cleaner state list

* fix: updating references to cache

* Update pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go

* test: adding composable network interface options to our test utils based on existing karp-core pattern

* test: adding test we don't include unmanaged nics

* test: adding network garbage collection suite and happy path

* test: adding tests for unremovable nics

* test: adding coverage that vm controller cleans up nics

* refactor: renaming controller

* fix: refactor name

* refactor: using import directly

* ci: checking error for controller

* fix: ci

* fix: addressing comments

* Update pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go

* refactor: removing name constant

* refactor: moving to test utils

* fix: removing GetZoneID

* style: improving the readability of the network interface garbage collection tests

* revert: removing lo.FromPtr checks for nodeclaim creation to avoid creating nodeclaims with empty required properties

* refactor: VirtualmachineProperties needs a default for time created

* fix: modifying the tests to be aware of time created to properly simulate vm gc

* refactor: added nodepoolName to test.VirtualMachine and test.Interface

* refactor: moving vm gc tests to use test.VirtualMachine

* refactor: renaming arg files to azureresourcegraph

* refactor: using deleteIfNicExists directly in cleanupAzureResources

* test: createNicFromQueryResponseData missing id, missing name and happy case

* refactor: using fake.Region for the default region for test.VirtualMachine and test.NetworkInterface

* fix: using input.InterfaceName rather than the outer scope interface name

* test: modifying fake to only initialize the query once
  • Loading branch information
Bryce-Soghigian committed Jan 30, 2025
1 parent 9cc27c0 commit 6692984
Show file tree
Hide file tree
Showing 20 changed files with 617 additions and 397 deletions.
1 change: 0 additions & 1 deletion pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,6 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.

labels[corev1beta1.CapacityTypeLabelKey] = instance.GetCapacityType(vm)

// TODO: v1beta1 new kes/labels
if tag, ok := vm.Tags[instance.NodePoolTagKey]; ok {
labels[corev1beta1.NodePoolLabelKey] = *tag
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ var _ = Describe("CloudProvider", func() {
nodeClaims, _ := cloudProvider.List(ctx)
Expect(azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Len()).To(Equal(1))
queryRequest := azureEnv.AzureResourceGraphAPI.AzureResourceGraphResourcesBehavior.CalledWithInput.Pop().Query
Expect(*queryRequest.Query).To(Equal(instance.GetListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(*queryRequest.Query).To(Equal(instance.GetVMListQueryBuilder(azureEnv.AzureResourceGraphAPI.ResourceGroup).String()))
Expect(nodeClaims).To(HaveLen(1))
Expect(nodeClaims[0]).ToNot(BeNil())
resp, _ := azureEnv.VirtualMachinesAPI.Get(ctx, azureEnv.AzureResourceGraphAPI.ResourceGroup, nodeClaims[0].Name, nil)
Expand Down
9 changes: 8 additions & 1 deletion pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,14 @@ import (
func NewControllers(ctx context.Context, kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider, instanceProvider *instance.Provider) []controller.Controller {
logging.FromContext(ctx).With("version", project.Version).Debugf("discovered version")
controllers := []controller.Controller{
nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider),
nodeclasshash.NewController(kubeClient),
nodeclassstatus.NewController(kubeClient),
nodeclasstermination.NewController(kubeClient, recorder),

nodeclaimgarbagecollection.NewVirtualMachine(kubeClient, cloudProvider),
nodeclaimgarbagecollection.NewNetworkInterface(kubeClient, instanceProvider),

// TODO: nodeclaim tagging
inplaceupdate.NewController(kubeClient, instanceProvider),
}
return controllers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ import (
"fmt"
"time"

"github.com/Azure/karpenter-provider-azure/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/operator/injection"
controllerruntime "sigs.k8s.io/controller-runtime"
"github.com/awslabs/operatorpkg/singleton"

"github.com/samber/lo"
"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
Expand All @@ -35,28 +38,25 @@ import (
corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/operator/controller"
)

type Controller struct {
type VirtualMachine struct {
kubeClient client.Client
cloudProvider *cloudprovider.CloudProvider
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller
cloudProvider corecloudprovider.CloudProvider
successfulCount uint64 // keeps track of successful reconciles for more aggressive requeuing near the start of the controller
}

func NewController(kubeClient client.Client, cloudProvider *cloudprovider.CloudProvider) *Controller {
return &Controller{
func NewVirtualMachine(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider) *VirtualMachine {
return &VirtualMachine{
kubeClient: kubeClient,
cloudProvider: cloudProvider,
successfulCount: 0,
}
}

func (c *Controller) Name() string {
return "nodeclaim.garbagecollection"
}
func (c *VirtualMachine) Reconcile(ctx context.Context) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "instance.garbagecollection")

func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
// We LIST VMs on the CloudProvider BEFORE we grab NodeClaims/Nodes on the cluster so that we make sure that, if
// LISTing instances takes a long time, our information is more updated by the time we get to nodeclaim and Node LIST
// This works since our CloudProvider instances are deleted based on whether the NodeClaim exists or not, not vice-versa
Expand Down Expand Up @@ -90,7 +90,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, multierr.Combine(errs...)
}

func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodeList *v1.NodeList) error {
func (c *VirtualMachine) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodeList *v1.NodeList) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", nodeClaim.Status.ProviderID))
if err := c.cloudProvider.Delete(ctx, nodeClaim); err != nil {
return corecloudprovider.IgnoreNodeClaimNotFoundError(err)
Expand All @@ -109,6 +109,9 @@ func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *corev1beta1.
return nil
}

func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Builder {
return controller.NewSingletonManagedBy(m)
func (c *VirtualMachine) Register(_ context.Context, m manager.Manager) error {
return controllerruntime.NewControllerManagedBy(m).
Named("instance.garbagecollection").
WatchesRawSource(singleton.Source()).
Complete(singleton.AsReconciler(c))
}
112 changes: 112 additions & 0 deletions pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
Portions Copyright (c) Microsoft Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package garbagecollection

import (
"context"
"fmt"
"time"

"github.com/samber/lo"
"knative.dev/pkg/logging"

"github.com/awslabs/operatorpkg/singleton"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/workqueue"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/karpenter/pkg/operator/injection"
corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/Azure/karpenter-provider-azure/pkg/providers/instance"
)

const (
	// NicReservationDuration is the window during which a network interface is
	// considered reserved. NOTE(review): not referenced within this file —
	// presumably consumed elsewhere as a grace period before a NIC becomes
	// eligible for garbage collection; confirm against callers.
	NicReservationDuration = time.Second * 180
	// NicGarbageCollectionInterval is how often the NIC GC reconciler requeues.
	// We set this interval at 5 minutes, as that's how often our NRP limits are reset.
	// See: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling#network-throttling
	NicGarbageCollectionInterval = time.Minute * 5
)

// NetworkInterface is a garbage-collection controller that deletes Azure
// network interfaces (NICs) that are no longer needed by any VM or NodeClaim.
type NetworkInterface struct {
	kubeClient       client.Client     // lists NodeClaims in the cluster
	instanceProvider instance.Provider // lists VMs/NICs and deletes NICs in Azure
}

// NewNetworkInterface constructs the NIC garbage-collection controller from
// the cluster client and the Azure instance provider it depends on.
func NewNetworkInterface(kubeClient client.Client, instanceProvider instance.Provider) *NetworkInterface {
	controller := &NetworkInterface{}
	controller.kubeClient = kubeClient
	controller.instanceProvider = instanceProvider
	return controller
}

// populateUnremovableInterfaces builds the set of NIC names that must not be
// garbage collected: names of currently listed VMs, plus the resource names
// derived from existing NodeClaims (whose NICs are presumably still in use or
// being provisioned — confirm against the provisioning flow).
func (c *NetworkInterface) populateUnremovableInterfaces(ctx context.Context) (sets.Set[string], error) {
	keep := sets.New[string]()

	// Every listed VM contributes its name to the keep set.
	virtualMachines, listErr := c.instanceProvider.List(ctx)
	if listErr != nil {
		return keep, fmt.Errorf("listing VMs: %w", listErr)
	}
	for i := range virtualMachines {
		keep.Insert(lo.FromPtr(virtualMachines[i].Name))
	}

	// Every NodeClaim contributes its generated resource name to the keep set.
	claims := &corev1beta1.NodeClaimList{}
	if err := c.kubeClient.List(ctx, claims); err != nil {
		return keep, fmt.Errorf("listing NodeClaims for NIC GC: %w", err)
	}
	for i := range claims.Items {
		keep.Insert(instance.GenerateResourceName(claims.Items[i].Name))
	}

	return keep, nil
}

// Reconcile garbage collects orphaned network interfaces: any NIC returned by
// the instance provider whose name is not claimed by a listed VM or an
// existing NodeClaim is deleted. It always requeues after
// NicGarbageCollectionInterval to align with NRP throttling windows.
func (c *NetworkInterface) Reconcile(ctx context.Context) (reconcile.Result, error) {
	ctx = injection.WithControllerName(ctx, "networkinterface.garbagecollection")
	nics, err := c.instanceProvider.ListNics(ctx)
	if err != nil {
		return reconcile.Result{}, fmt.Errorf("listing NICs: %w", err)
	}

	unremovableInterfaces, err := c.populateUnremovableInterfaces(ctx)
	if err != nil {
		// Without the full unremovable set we cannot safely delete anything.
		// Note: wrap message follows Go convention (no "error" prefix, ": %w").
		return reconcile.Result{}, fmt.Errorf("listing resources needed to populate unremovable nics: %w", err)
	}
	workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int) {
		nicName := lo.FromPtr(nics[i].Name)
		if unremovableInterfaces.Has(nicName) {
			return
		}
		if err := c.instanceProvider.DeleteNic(ctx, nicName); err != nil {
			// Include the NIC name (matching the success log) so failures are
			// attributable; the next reconcile pass retries this NIC.
			logging.FromContext(ctx).With("nic", nicName).Error(err)
			return
		}
		logging.FromContext(ctx).With("nic", nicName).Infof("garbage collected NIC")
	})
	return reconcile.Result{
		RequeueAfter: NicGarbageCollectionInterval,
	}, nil
}

// Register wires the controller into the manager as a singleton reconciler:
// it watches no Kubernetes objects, being driven instead by the singleton
// source and the RequeueAfter returned from Reconcile.
func (c *NetworkInterface) Register(_ context.Context, m manager.Manager) error {
	return controllerruntime.NewControllerManagedBy(m).
		Named("networkinterface.garbagecollection").
		WatchesRawSource(singleton.Source()).
		Complete(singleton.AsReconciler(c))
}
Loading

0 comments on commit 6692984

Please sign in to comment.