Skip to content

Commit 825d989

Browse files
committed
Add e2e test for NVIDIA MIG support on Ubuntu 24.04
- Add Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG to validate MIG functionality
- Configure test with Standard_NC24ads_A100_v4 VM size and MIG2g instance profile
- Add ValidateMIGModeEnabled validator to check MIG mode is enabled via nvidia-smi
- Add ValidateMIGInstancesCreated validator to verify MIG instances are properly created
- Test validates device plugin, DCGM exporter, and GPU resource scheduling with MIG

Signed-off-by: Suraj Deshmukh <[email protected]>
1 parent 6320eaf commit 825d989

File tree

2 files changed

+98
-0
lines changed

2 files changed

+98
-0
lines changed

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,65 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
190190
},
191191
})
192192
}
193+
194+
func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
195+
RunScenario(t, &Scenario{
196+
Description: "Tests that NVIDIA device plugin and DCGM Exporter work with MIG enabled on Ubuntu 24.04 GPU nodes",
197+
Tags: Tags{
198+
GPU: true,
199+
},
200+
Config: Config{
201+
Cluster: ClusterKubenet,
202+
VHD: config.VHDUbuntu2404Gen2Containerd,
203+
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
204+
nbc.AgentPoolProfile.VMSize = "Standard_NC24ads_A100_v4"
205+
nbc.ConfigGPUDriverIfNeeded = true
206+
nbc.EnableGPUDevicePluginIfNeeded = true
207+
nbc.EnableNvidia = true
208+
nbc.GPUInstanceProfile = "MIG2g"
209+
},
210+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
211+
vmss.SKU.Name = to.Ptr("Standard_NC24ads_A100_v4")
212+
if vmss.Tags == nil {
213+
vmss.Tags = map[string]*string{}
214+
}
215+
vmss.Tags["EnableManagedGPUExperience"] = to.Ptr("true")
216+
},
217+
Validator: func(ctx context.Context, s *Scenario) {
218+
os := "ubuntu"
219+
osVersion := "r2404"
220+
221+
// Validate that the NVIDIA device plugin binary was installed correctly
222+
versions := components.GetExpectedPackageVersions("nvidia-device-plugin", os, osVersion)
223+
require.Lenf(s.T, versions, 1, "Expected exactly one nvidia-device-plugin version for %s %s but got %d", os, osVersion, len(versions))
224+
ValidateInstalledPackageVersion(ctx, s, "nvidia-device-plugin", versions[0])
225+
226+
// Validate that the NVIDIA device plugin systemd service is running
227+
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
228+
229+
// Validate that MIG mode is enabled via nvidia-smi
230+
ValidateMIGModeEnabled(ctx, s)
231+
232+
// Validate that MIG instances are created
233+
ValidateMIGInstancesCreated(ctx, s, "MIG 2g.20gb")
234+
235+
// Validate that GPU resources are advertised by the device plugin
236+
ValidateNodeAdvertisesGPUResources(ctx, s, 3)
237+
238+
// Validate that MIG workloads can be scheduled
239+
ValidateGPUWorkloadSchedulable(ctx, s, 3)
240+
241+
// Validate that the NVIDIA DCGM packages were installed correctly
242+
for _, packageName := range getDCGMPackageNames(os) {
243+
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)
244+
require.Lenf(s.T, versions, 1, "Expected exactly one %s version for %s %s but got %d", packageName, os, osVersion, len(versions))
245+
ValidateInstalledPackageVersion(ctx, s, packageName, versions[0])
246+
}
247+
248+
ValidateNvidiaDCGMExporterSystemDServiceRunning(ctx, s)
249+
ValidateNvidiaDCGMExporterIsScrapable(ctx, s)
250+
ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx, s)
251+
},
252+
},
253+
})
254+
}

e2e/validators.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,3 +1182,39 @@ func ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx context.Context, s *Scenar
11821182
}
11831183
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Nvidia DCGM Exporter is not returning DCGM_FI_DEV_GPU_UTIL")
11841184
}
1185+
1186+
func ValidateMIGModeEnabled(ctx context.Context, s *Scenario) {
1187+
s.T.Helper()
1188+
s.T.Logf("validating that MIG mode is enabled")
1189+
1190+
command := []string{
1191+
"set -ex",
1192+
// Grep to verify it contains 'Enabled' - this will fail if MIG is disabled
1193+
"sudo nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep -i 'Enabled'",
1194+
}
1195+
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "MIG mode is not enabled")
1196+
1197+
stdout := strings.TrimSpace(execResult.stdout.String())
1198+
s.T.Logf("MIG mode status: %s", stdout)
1199+
require.Contains(s.T, stdout, "Enabled", "expected MIG mode to be enabled, but got: %s", stdout)
1200+
s.T.Logf("MIG mode is enabled")
1201+
}
1202+
1203+
func ValidateMIGInstancesCreated(ctx context.Context, s *Scenario, migProfile string) {
1204+
s.T.Helper()
1205+
s.T.Logf("validating that MIG instances are created with profile %s", migProfile)
1206+
1207+
command := []string{
1208+
"set -ex",
1209+
// List MIG devices using nvidia-smi
1210+
"sudo nvidia-smi mig -lgi",
1211+
// Ensure the output contains the expected MIG profile (will fail if "No MIG-enabled devices found")
1212+
"sudo nvidia-smi mig -lgi | grep -v 'No MIG-enabled devices found' | grep -q '" + migProfile + "'",
1213+
}
1214+
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "MIG instances with profile "+migProfile+" were not found")
1215+
1216+
stdout := execResult.stdout.String()
1217+
require.Contains(s.T, stdout, migProfile, "expected to find MIG profile %s in output, but did not.\nOutput:\n%s", migProfile, stdout)
1218+
require.NotContains(s.T, stdout, "No MIG-enabled devices found", "no MIG devices were created.\nOutput:\n%s", stdout)
1219+
s.T.Logf("MIG instances with profile %s are created", migProfile)
1220+
}

0 commit comments

Comments
 (0)