Skip to content

Commit d2bfaa3

Browse files
Managed GPU experience AFEC enablement flag (#7210)
Co-authored-by: Claude <[email protected]>
1 parent fd46cb8 commit d2bfaa3

File tree

109 files changed

+725
-127
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

109 files changed

+725
-127
lines changed

aks-node-controller/parser/parser.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
8484
"MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())),
8585
"CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()),
8686
"ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetGpuDevicePlugin()),
87+
"MANAGED_GPU_EXPERIENCE_AFEC_ENABLED": fmt.Sprintf("%v", config.GetGpuConfig().GetManagedGpuExperienceAfecEnabled()),
8788
"TELEPORTD_PLUGIN_DOWNLOAD_URL": config.GetTeleportConfig().GetTeleportdPluginDownloadUrl(),
8889
"CREDENTIAL_PROVIDER_DOWNLOAD_URL": config.GetKubeBinaryConfig().GetLinuxCredentialProviderUrl(),
8990
"CONTAINERD_VERSION": config.GetContainerdConfig().GetContainerdVersion(),

aks-node-controller/pkg/gen/aksnodeconfig/v1/gpu_config.pb.go

Lines changed: 24 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

aks-node-controller/proto/aksnodeconfig/v1/gpu_config.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,7 @@ message GpuConfig {
1919

2020
// Same as enable_nvidia, but for AMD GPUs.
2121
optional bool enable_amd_gpu = 5;
22+
23+
// Specifies whether the managed GPU experience AFEC (Azure Feature Exposure Control) flag is enabled.
24+
bool managed_gpu_experience_afec_enabled = 6;
2225
}

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
4343
nbc.ConfigGPUDriverIfNeeded = true
4444
nbc.EnableGPUDevicePluginIfNeeded = true
4545
nbc.EnableNvidia = true
46+
nbc.ManagedGPUExperienceAFECEnabled = true
4647
},
4748
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
4849
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
@@ -98,6 +99,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
9899
nbc.ConfigGPUDriverIfNeeded = true
99100
nbc.EnableGPUDevicePluginIfNeeded = true
100101
nbc.EnableNvidia = true
102+
nbc.ManagedGPUExperienceAFECEnabled = true
101103
},
102104
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
103105
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
@@ -152,6 +154,7 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
152154
nbc.ConfigGPUDriverIfNeeded = true
153155
nbc.EnableGPUDevicePluginIfNeeded = true
154156
nbc.EnableNvidia = true
157+
nbc.ManagedGPUExperienceAFECEnabled = true
155158
},
156159
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
157160
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")

parts/linux/cloud-init/artifacts/cse_cmd.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ SGX_NODE={{GetVariable "sgxNode"}}
6363
MIG_NODE={{GetVariable "migNode"}}
6464
CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
6565
ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED={{GetVariable "enableGPUDevicePluginIfNeeded"}}
66+
MANAGED_GPU_EXPERIENCE_AFEC_ENABLED="{{IsManagedGPUExperienceAFECEnabled}}"
6667
TELEPORTD_PLUGIN_DOWNLOAD_URL={{GetParameter "teleportdPluginURL"}}
6768
CREDENTIAL_PROVIDER_DOWNLOAD_URL={{GetParameter "linuxCredentialProviderURL"}}
6869
CONTAINERD_VERSION={{GetParameter "containerdVersion"}}

pkg/agent/baker.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,9 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration
11261126
"IsArtifactStreamingEnabled": func() bool {
11271127
return config.EnableArtifactStreaming
11281128
},
1129+
"IsManagedGPUExperienceAFECEnabled": func() bool {
1130+
return config.ManagedGPUExperienceAFECEnabled
1131+
},
11291132
"EnableIMDSRestriction": func() bool {
11301133
return config.EnableIMDSRestriction
11311134
},

pkg/agent/baker_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,6 +2005,42 @@ oom_score = -999
20052005
// Verify GPU driver configuration is enabled
20062006
Expect(o.vars["CONFIG_GPU_DRIVER_IF_NEEDED"]).To(Equal("true"))
20072007
}),
2008+
Entry("AKSUbuntu2204 with ManagedGPUExperienceAFECEnabled", "AKSUbuntu2204+ManagedGPUExperienceAFEC", "1.29.7",
2009+
func(config *datamodel.NodeBootstrappingConfiguration) {
2010+
config.ContainerService.Properties.AgentPoolProfiles[0].KubernetesConfig = &datamodel.KubernetesConfig{
2011+
ContainerRuntime: datamodel.Containerd,
2012+
}
2013+
config.ContainerService.Properties.AgentPoolProfiles[0].Distro = datamodel.AKSUbuntuContainerd2204
2014+
config.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
2015+
config.EnableNvidia = true
2016+
config.ConfigGPUDriverIfNeeded = true
2017+
config.EnableGPUDevicePluginIfNeeded = true
2018+
config.ManagedGPUExperienceAFECEnabled = true
2019+
}, func(o *nodeBootstrappingOutput) {
2020+
// Verify ManagedGPUExperienceAFECEnabled is set
2021+
Expect(o.vars["MANAGED_GPU_EXPERIENCE_AFEC_ENABLED"]).To(Equal("true"))
2022+
// Verify other GPU settings are also correct
2023+
Expect(o.vars["GPU_NODE"]).To(Equal("true"))
2024+
Expect(o.vars["ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED"]).To(Equal("true"))
2025+
}),
2026+
Entry("AKSUbuntu2204 with ManagedGPUExperienceAFECEnabled disabled", "AKSUbuntu2204+ManagedGPUExperienceAFEC+Disabled", "1.29.7",
2027+
func(config *datamodel.NodeBootstrappingConfiguration) {
2028+
config.ContainerService.Properties.AgentPoolProfiles[0].KubernetesConfig = &datamodel.KubernetesConfig{
2029+
ContainerRuntime: datamodel.Containerd,
2030+
}
2031+
config.ContainerService.Properties.AgentPoolProfiles[0].Distro = datamodel.AKSUbuntuContainerd2204
2032+
config.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
2033+
config.EnableNvidia = true
2034+
config.ConfigGPUDriverIfNeeded = true
2035+
config.EnableGPUDevicePluginIfNeeded = true
2036+
config.ManagedGPUExperienceAFECEnabled = false
2037+
}, func(o *nodeBootstrappingOutput) {
2038+
// Verify ManagedGPUExperienceAFECEnabled is disabled
2039+
Expect(o.vars["MANAGED_GPU_EXPERIENCE_AFEC_ENABLED"]).To(Equal("false"))
2040+
// Verify other GPU settings are still correct
2041+
Expect(o.vars["GPU_NODE"]).To(Equal("true"))
2042+
Expect(o.vars["ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED"]).To(Equal("true"))
2043+
}),
20082044
Entry("CustomizedImage VHD should not have provision_start.sh", "CustomizedImage", "1.24.2",
20092045
func(c *datamodel.NodeBootstrappingConfiguration) {
20102046
c.ContainerService.Properties.AgentPoolProfiles[0].KubernetesConfig = &datamodel.KubernetesConfig{

pkg/agent/datamodel/types.go

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,26 +1744,27 @@ type GetLatestSigImageConfigRequest struct {
17441744

17451745
// NodeBootstrappingConfiguration represents configurations for node bootstrapping.
17461746
type NodeBootstrappingConfiguration struct {
1747-
ContainerService *ContainerService
1748-
CloudSpecConfig *AzureEnvironmentSpecConfig
1749-
K8sComponents *K8sComponents
1750-
AgentPoolProfile *AgentPoolProfile
1751-
TenantID string
1752-
SubscriptionID string
1753-
ResourceGroupName string
1754-
UserAssignedIdentityClientID string
1755-
OSSKU string
1756-
ConfigGPUDriverIfNeeded bool
1757-
Disable1804SystemdResolved bool
1758-
EnableGPUDevicePluginIfNeeded bool
1759-
EnableKubeletConfigFile bool
1760-
EnableNvidia bool
1761-
EnableAMDGPU bool
1762-
EnableACRTeleportPlugin bool
1763-
TeleportdPluginURL string
1764-
EnableArtifactStreaming bool
1765-
ContainerdVersion string
1766-
RuncVersion string
1747+
ContainerService *ContainerService
1748+
CloudSpecConfig *AzureEnvironmentSpecConfig
1749+
K8sComponents *K8sComponents
1750+
AgentPoolProfile *AgentPoolProfile
1751+
TenantID string
1752+
SubscriptionID string
1753+
ResourceGroupName string
1754+
UserAssignedIdentityClientID string
1755+
OSSKU string
1756+
ConfigGPUDriverIfNeeded bool
1757+
Disable1804SystemdResolved bool
1758+
EnableGPUDevicePluginIfNeeded bool
1759+
EnableKubeletConfigFile bool
1760+
EnableNvidia bool
1761+
EnableAMDGPU bool
1762+
ManagedGPUExperienceAFECEnabled bool
1763+
EnableACRTeleportPlugin bool
1764+
TeleportdPluginURL string
1765+
EnableArtifactStreaming bool
1766+
ContainerdVersion string
1767+
RuncVersion string
17671768
// ContainerdPackageURL and RuncPackageURL are beneficial for testing non-official.
17681769
// containerd and runc, like the pre-released ones.
17691770
// Currently both configurations are for test purpose, and only deb package is supported.

0 commit comments

Comments
 (0)