Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions cni/network/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"github.com/Azure/azure-container-networking/cni/util"
"github.com/Azure/azure-container-networking/cns"
cnscli "github.com/Azure/azure-container-networking/cns/client"
"github.com/Azure/azure-container-networking/cns/fsnotify"
"github.com/Azure/azure-container-networking/common"
"github.com/Azure/azure-container-networking/dhcp"
"github.com/Azure/azure-container-networking/iptables"
Expand Down Expand Up @@ -716,7 +715,7 @@ func (plugin *NetPlugin) createEpInfo(opt *createEpInfoOpt) (*network.EndpointIn
*opt.infraSeen = true
} else {
ifName = "eth" + strconv.Itoa(opt.endpointIndex)
endpointID = plugin.nm.GetEndpointID(opt.args.ContainerID, ifName)
endpointID = plugin.nm.GetEndpointIDByNicType(opt.args.ContainerID, ifName, opt.ifInfo.NICType)
}

endpointInfo := network.EndpointInfo{
Expand Down Expand Up @@ -1069,32 +1068,45 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
if plugin.nm.IsStatelessCNIMode() {
// network ID is passed in and used only for migration
// otherwise, in stateless, we don't need the network id for deletion
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID)
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID, args.Netns)
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure
// return a retriable error so the container runtime will retry this DEL later
// the implementation of this function returns nil if the endpoint doesn't exist, so
// we don't have to check that here
if err != nil {
if errors.Is(err, network.ErrConnectionFailure) {
logger.Info("failed to connect to CNS", zap.String("containerID", args.ContainerID), zap.Error(err))
addErr := fsnotify.AddFile(args.ContainerID, args.ContainerID, watcherPath)
logger.Info("add containerid file for Asynch delete", zap.String("containerID", args.ContainerID), zap.Error(addErr))
if addErr != nil {
logger.Error("failed to add file to watcher", zap.String("containerID", args.ContainerID), zap.Error(addErr))
return errors.Wrap(addErr, fmt.Sprintf("failed to add file to watcher with containerID %s", args.ContainerID))
switch {
case errors.Is(err, network.ErrConnectionFailure):
logger.Error("Failed to connect to CNS", zap.Error(err))
logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
// we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
// This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state.
if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it should still create file for cns to release ip allocated for pod

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in the new commit

logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err))
}
return nil
}
if errors.Is(err, network.ErrEndpointStateNotFound) {
case errors.Is(err, network.ErrEndpointStateNotFound):
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
return nil
default:
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
}
} else {
for _, epInfo := range epInfos {
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
}
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
}
} else {
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
}

// for when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
// this block is not applied to stateless CNI
// for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
// this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file
Copy link
Contributor

@QxBytes QxBytes Sep 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No description provided.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you clarify the comment (1109-1110) to:
the following block applies to:

  • stateful when ...
  • stateless when ...

if len(epInfos) == 0 {
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
if !nwCfg.MultiTenancy {
Expand All @@ -1120,7 +1132,7 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
if err = plugin.nm.DeleteEndpoint(epInfo.NetworkID, epInfo.EndpointID, epInfo); err != nil {
// An error will not be returned if the endpoint is not found
// return a retriable error so the container runtime will retry this DEL later
// the implementation of this function returns nil if the endpoint doens't exist, so
// the implementation of this function returns nil if the endpoint doesn't exist, so
// we don't have to check that here
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
}
Expand Down
9 changes: 9 additions & 0 deletions network/endpoint_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,3 +547,12 @@ func getDefaultGateway(routes []RouteInfo) net.IP {
func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*EndpointInfo, error) {
return epInfo, nil
}

// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace.
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error {
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep)
if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil {
return err
}
return nil
}
5 changes: 5 additions & 0 deletions network/endpoint_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -746,3 +746,8 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri
logger.Info("Retrieved device problem code", zap.String("code", devpkeyDeviceProblemCode))
return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil
}

// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace.
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error {
return nil
}
43 changes: 36 additions & 7 deletions network/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,13 @@ type NetworkManager interface {
UpdateEndpoint(networkID string, existingEpInfo *EndpointInfo, targetEpInfo *EndpointInfo) error
GetNumberOfEndpoints(ifName string, networkID string) int
GetEndpointID(containerID, ifName string) string
GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: GetEndpointIDByNICType as nic is an acronym

IsStatelessCNIMode() bool
SaveState(eps []*endpoint) error
DeleteState(epInfos []*EndpointInfo) error
GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo
GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error)
GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error)
RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error
}

// Creates a new network manager.
Expand Down Expand Up @@ -455,7 +457,7 @@ func validateUpdateEndpointState(endpointID string, ifNameToIPInfoMap map[string
// GetEndpointState will make a call to CNS GetEndpointState API in the stateless CNI mode to fetch the endpointInfo
// TODO unit tests need to be added, WorkItem: 26606939
// In stateless cni, container id is the endpoint id, so you can pass in either
func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error) {
func (nm *networkManager) GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) {
endpointResponse, err := nm.CnsClient.GetEndpoint(context.TODO(), containerID)
if err != nil {
if endpointResponse.Response.ReturnCode == types.NotFound {
Expand All @@ -466,7 +468,7 @@ func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*En
}
return nil, ErrGetEndpointStateFailure
}
epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID)
epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID, netns)

for i := 0; i < len(epInfos); i++ {
if epInfos[i].NICType == cns.InfraNIC {
Expand Down Expand Up @@ -514,7 +516,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
nw := &network{
Id: networkID, // currently unused in stateless cni
HnsId: epInfo.HNSNetworkID,
Mode: opModeTransparentVlan,
Mode: opModeTransparent,
SnatBridgeIP: "",
NetNs: dummyGUID, // to trigger hns v2, windows
extIf: &externalInterface{
Expand All @@ -529,6 +531,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
HNSNetworkID: epInfo.HNSNetworkID, // unused (we use nw.HnsId for deleting the network)
HostIfName: epInfo.HostIfName,
LocalIP: "",
IPAddresses: epInfo.IPAddresses,
VlanID: 0,
AllowInboundFromHostToNC: false, // stateless currently does not support apipa
AllowInboundFromNCToHost: false,
Expand All @@ -537,11 +540,12 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
NetworkContainerID: epInfo.NetworkContainerID, // we don't use this as long as AllowInboundFromHostToNC and AllowInboundFromNCToHost are false
NetNs: dummyGUID, // to trigger hnsv2, windows
NICType: epInfo.NICType,
NetworkNameSpace: epInfo.NetNsPath,
IfName: epInfo.IfName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
}
logger.Info("Deleting endpoint with", zap.String("Endpoint Info: ", epInfo.PrettyString()), zap.String("HNISID : ", ep.HnsId))

err := nw.deleteEndpointImpl(netlink.NewNetlink(), platform.NewExecClient(logger), nil, nil, nil, nil, nil, ep)
err := nw.deleteEndpointImpl(nm.netlink, nm.plClient, nil, nm.netio, nm.nsClient, nm.iptablesClient, nm.dhcpClient, ep)
if err != nil {
return err
}
Expand All @@ -562,7 +566,7 @@ func (nm *networkManager) GetEndpointInfo(networkID, endpointID string) (*Endpoi

if nm.IsStatelessCNIMode() {
logger.Info("calling cns getEndpoint API")
epInfos, err := nm.GetEndpointState(networkID, endpointID)
epInfos, err := nm.GetEndpointState(networkID, endpointID, "")
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -745,6 +749,16 @@ func (nm *networkManager) GetEndpointID(containerID, ifName string) string {
return containerID + "-" + ifName
}

// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
func (nm *networkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for stateful cni, this is not an issue? what's the impact if we remove statelesscnimode check here?

return containerID + "-" + ifName
}
// For InfraNIC, use GetEndpointID() logic.
return nm.GetEndpointID(containerID, ifName)
}

// saves the map of network ids to endpoints to the state file
func (nm *networkManager) SaveState(eps []*endpoint) error {
nm.Lock()
Expand Down Expand Up @@ -779,7 +793,7 @@ func (nm *networkManager) DeleteState(_ []*EndpointInfo) error {
}

// called to convert a cns restserver EndpointInfo into a network EndpointInfo
func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID string) []*EndpointInfo {
func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID, netns string) []*EndpointInfo {
ret := []*EndpointInfo{}

for ifName, ipInfo := range endpointInfo.IfnameToIPMap {
Expand Down Expand Up @@ -809,6 +823,10 @@ func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointI
epInfo.NICType = ipInfo.NICType
epInfo.HNSNetworkID = ipInfo.HnsNetworkID
epInfo.MacAddress = net.HardwareAddr(ipInfo.MacAddress)
// fill out the netns if it is empty via args passed by container runtime
if epInfo.NetNsPath == "" {
epInfo.NetNsPath = netns
}
ret = append(ret, epInfo)
}
return ret
Expand Down Expand Up @@ -847,3 +865,14 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo {

return ifNametoIPInfoMap
}

// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns
func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error {
ep := &endpoint{
NetworkNameSpace: netns,
IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
}
logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns))
err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient)
return err
}
17 changes: 16 additions & 1 deletion network/manager_mock.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package network

import (
"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/common"
)

Expand Down Expand Up @@ -94,6 +95,16 @@ func (nm *MockNetworkManager) GetEndpointID(containerID, ifName string) string {
return containerID + "-" + ifName
}

// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
func (nm *MockNetworkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
return containerID + "-" + ifName
}
// For InfraNIC, use GetEndpointID() logic.
return nm.GetEndpointID(containerID, ifName)
}

func (nm *MockNetworkManager) GetAllEndpoints(networkID string) (map[string]*EndpointInfo, error) {
return nm.TestEndpointInfoMap, nil
}
Expand Down Expand Up @@ -207,6 +218,10 @@ func (nm *MockNetworkManager) GetEndpointInfosFromContainerID(containerID string
return ret
}

func (nm *MockNetworkManager) GetEndpointState(_, _ string) ([]*EndpointInfo, error) {
func (nm *MockNetworkManager) GetEndpointState(_, _, _ string) ([]*EndpointInfo, error) {
return []*EndpointInfo{}, nil
}

func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error {
return nil
}
110 changes: 108 additions & 2 deletions network/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ var _ = Describe("Test Manager", func() {
PodNamespace: "test-pod-ns",
}

epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID)
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "")

Expect(len(epInfos)).To(Equal(1))
Expect(epInfos[0]).To(Equal(
Expand Down Expand Up @@ -400,7 +400,7 @@ var _ = Describe("Test Manager", func() {
PodNamespace: "test-pod-ns",
}

epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID)
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "")

Expect(len(epInfos)).To(Equal(2))
Expect(epInfos).To(ContainElement(
Expand Down Expand Up @@ -489,3 +489,109 @@ var _ = Describe("Test Manager", func() {
})
})
})

func TestGetEndpointIDByNicType_Cases(t *testing.T) {
nm := &networkManager{}

cases := []struct {
name string
stateless bool
containerID string
ifName string
nicType cns.NICType
expectedResult string
}{
{
name: "Stateless InfraNIC",
stateless: true,
containerID: "container123",
ifName: "eth0",
nicType: cns.InfraNIC,
expectedResult: "container123",
},
{
name: "Stateless SecondaryNIC",
stateless: true,
containerID: "container123",
ifName: "eth1",
nicType: cns.DelegatedVMNIC,
expectedResult: "container123-eth1",
},
{
name: "Stateful InfraNIC",
stateless: false,
containerID: "container123456789",
ifName: "eth0",
nicType: cns.InfraNIC,
expectedResult: "containe-eth0", // truncated to 8 chars
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
nm.statelessCniMode = tc.stateless
id := nm.GetEndpointIDByNicType(tc.containerID, tc.ifName, tc.nicType)
if id != tc.expectedResult {
t.Errorf("expected %s, got %s", tc.expectedResult, id)
}
})
}
}

func TestCnsEndpointInfotoCNIEpInfos_Cases(t *testing.T) {
cases := []struct {
name string
ifName string
ipInfo restserver.IPInfo
netNs string
expectedNetNs string
expectedIfName string
expectedNICType cns.NICType
}{
{
name: "DelegatedVMNIC",
ifName: "eth1",
netNs: "/var/run/netns/testns",
ipInfo: restserver.IPInfo{
NICType: cns.DelegatedVMNIC,
},
expectedNetNs: "/var/run/netns/testns",
expectedIfName: "eth1",
expectedNICType: cns.DelegatedVMNIC,
},
{
name: "InfraNIC",
ifName: "eth0",
netNs: "",
ipInfo: restserver.IPInfo{
NICType: cns.InfraNIC,
},
expectedNetNs: "",
expectedIfName: "eth0",
expectedNICType: cns.InfraNIC,
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
endpointInfo := restserver.EndpointInfo{
IfnameToIPMap: map[string]*restserver.IPInfo{
tc.ifName: &tc.ipInfo,
},
}
epInfos := cnsEndpointInfotoCNIEpInfos(endpointInfo, "container123", tc.netNs)
if len(epInfos) == 0 {
t.Fatalf("expected at least one epInfo")
}
if epInfos[0].NetNsPath != tc.expectedNetNs {
t.Errorf("expected NetNsPath %q, got %q", tc.expectedNetNs, epInfos[0].NetNsPath)
}
if epInfos[0].IfName != tc.expectedIfName {
t.Errorf("expected IfName %q, got %q", tc.expectedIfName, epInfos[0].IfName)
}
if epInfos[0].NICType != tc.expectedNICType {
t.Errorf("expected NICType %v, got %v", tc.expectedNICType, epInfos[0].NICType)
}
})
}
}
Loading
Loading