-
Notifications
You must be signed in to change notification settings - Fork 69
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: properly garbage collecting orphaned network interfaces (#642)
* feat: adding ListNics to instanceprovider interface alongside a refactor of arg related functions to their own file * feat: adding garbage collection logic for network interfaces and refactoring gc functions slightly * feat: working poc of ARG Queries and nic garbage collection, need to fix tests * fix: tests failing due to ListVM * feat: split nic and vm into their own gc controllers, added shared state between them to prevent conflicts in nic deletion calls * feat: Add DeleteNic option to instance provider * docs(code-comment): adding clarification to unremovableNics * test: instanceprovider.ListNics barebones test and arg fake list nic impl * fix: bootstrap.sh * feat: adding in VM List into the networkinterface.garbagecollection controller to avoid attempts to delete nics that are managed by a vm * refactor: unremovableNics out of the vm gc controller in favor for a cleaner state list * fix:updating references to cache * Update pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go * test: adding composable network interface options to our test utils based on existing karp-core pattern * test: adding test we don't include unmanaged nics * test: adding network garbage collection suite and happy path * test: adding tests for unremovable nics * test: adding coverage that vm controller cleans up nics * refactor: renaming controller * fix: refactor name * refactor: using import directly * ci: checking error for controller * fix: ci * fix: addressing comments * Update pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go * refactor: removing name constant * refactor: moving to test utils * fix: removing GetZoneID * style: improving the readability of the network interface garbage collection tests * revert: removing lo.FromPtr checks for nodeclaim creation to avoid creating nodeclaims with empty required properties * refactor: VirtualmachineProperties needs a default for time created * fix: modifying the tests to be aware of time created to properly simulate vm gc * refactor: added nodepoolName to test.VirtualMachine and test.Interface * refactor: moving vm gc tests to use test.VirtualMachine * refactor: renaming arg files to azureresourcegraph * refactor: using deleteIfNicExists directly in cleanupAzureResources * test: createNicFromQueryResponseData missing id, missing name and happy case * refactor: using fake.Region for the default region for test.VirtualMachine and test.NetworkInterface * fix: using input.InterfaceName rather than the outer scope interface name * test: modifying fake to only initialize the query once
- Loading branch information
1 parent
9cc27c0
commit 6692984
Showing
20 changed files
with
617 additions
and
397 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
112 changes: 112 additions & 0 deletions
112
pkg/controllers/nodeclaim/garbagecollection/nic_garbagecollection.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
/* | ||
Portions Copyright (c) Microsoft Corporation. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package garbagecollection | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/samber/lo" | ||
"knative.dev/pkg/logging" | ||
|
||
"github.com/awslabs/operatorpkg/singleton" | ||
"k8s.io/apimachinery/pkg/util/sets" | ||
"k8s.io/client-go/util/workqueue" | ||
controllerruntime "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/manager" | ||
"sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
"sigs.k8s.io/karpenter/pkg/operator/injection" | ||
corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1" | ||
|
||
"github.com/Azure/karpenter-provider-azure/pkg/providers/instance" | ||
) | ||
|
||
const ( | ||
NicReservationDuration = time.Second * 180 | ||
// We set this interval at 5 minutes, as thats how often our NRP limits are reset. | ||
// See: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling#network-throttling | ||
NicGarbageCollectionInterval = time.Minute * 5 | ||
) | ||
|
||
type NetworkInterface struct { | ||
kubeClient client.Client | ||
instanceProvider instance.Provider | ||
} | ||
|
||
func NewNetworkInterface(kubeClient client.Client, instanceProvider instance.Provider) *NetworkInterface { | ||
return &NetworkInterface{ | ||
kubeClient: kubeClient, | ||
instanceProvider: instanceProvider, | ||
} | ||
} | ||
|
||
func (c *NetworkInterface) populateUnremovableInterfaces(ctx context.Context) (sets.Set[string], error) { | ||
unremovableInterfaces := sets.New[string]() | ||
vms, err := c.instanceProvider.List(ctx) | ||
if err != nil { | ||
return unremovableInterfaces, fmt.Errorf("listing VMs: %w", err) | ||
} | ||
for _, vm := range vms { | ||
unremovableInterfaces.Insert(lo.FromPtr(vm.Name)) | ||
} | ||
nodeClaimList := &corev1beta1.NodeClaimList{} | ||
if err := c.kubeClient.List(ctx, nodeClaimList); err != nil { | ||
return unremovableInterfaces, fmt.Errorf("listing NodeClaims for NIC GC: %w", err) | ||
} | ||
|
||
for _, nodeClaim := range nodeClaimList.Items { | ||
unremovableInterfaces.Insert(instance.GenerateResourceName(nodeClaim.Name)) | ||
} | ||
return unremovableInterfaces, nil | ||
} | ||
|
||
func (c *NetworkInterface) Reconcile(ctx context.Context) (reconcile.Result, error) { | ||
ctx = injection.WithControllerName(ctx, "networkinterface.garbagecollection") | ||
nics, err := c.instanceProvider.ListNics(ctx) | ||
if err != nil { | ||
return reconcile.Result{}, fmt.Errorf("listing NICs: %w", err) | ||
} | ||
|
||
unremovableInterfaces, err := c.populateUnremovableInterfaces(ctx) | ||
if err != nil { | ||
return reconcile.Result{}, fmt.Errorf("error listing resources needed to populate unremovable nics %w", err) | ||
} | ||
workqueue.ParallelizeUntil(ctx, 100, len(nics), func(i int) { | ||
nicName := lo.FromPtr(nics[i].Name) | ||
if !unremovableInterfaces.Has(nicName) { | ||
err := c.instanceProvider.DeleteNic(ctx, nicName) | ||
if err != nil { | ||
logging.FromContext(ctx).Error(err) | ||
return | ||
} | ||
|
||
logging.FromContext(ctx).With("nic", nicName).Infof("garbage collected NIC") | ||
} | ||
}) | ||
return reconcile.Result{ | ||
RequeueAfter: NicGarbageCollectionInterval, | ||
}, nil | ||
} | ||
|
||
func (c *NetworkInterface) Register(_ context.Context, m manager.Manager) error { | ||
return controllerruntime.NewControllerManagedBy(m). | ||
Named("networkinterface.garbagecollection"). | ||
WatchesRawSource(singleton.Source()). | ||
Complete(singleton.AsReconciler(c)) | ||
} |
Oops, something went wrong.