@@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10511051 s .verifyDeviceConfigStatus (devCfg , c )
10521052 s .verifyNodeGPULabel (devCfg , c )
10531053
1054- ret , err := utils .GetAMDGPUCount (ctx , s .clientSet )
1054+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "gpu" )
10551055 if err != nil {
10561056 logger .Errorf ("error: %v" , err )
10571057 }
@@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10781078 err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
10791079 assert .NoError (c , err , "failed to deploy pods" )
10801080 s .verifyROCMPOD (true , c )
1081- err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount )
1081+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "gpu" )
10821082 assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
10831083
10841084 // delete
@@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10921092 assert .NoError (c , err , "failed to reboot nodes" )
10931093}
10941094
1095+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHomogeneousSingle (c * C ) {
1096+ if s .simEnable {
1097+ c .Skip ("Skipping for non amd gpu testbed" )
1098+ }
1099+ if ! dcmImageDefined {
1100+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1101+ }
1102+
1103+ s .configMapHelper (c )
1104+
1105+ logger .Infof ("Add node label after pod comes up" )
1106+ time .Sleep (30 * time .Second )
1107+
1108+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1109+ nodeNames := make ([]string , 0 )
1110+ for _ , node := range nodes {
1111+ nodeNames = append (nodeNames , node .Name )
1112+ }
1113+ for _ , nodeName := range nodeNames {
1114+ s .addRemoveNodeLabels (nodeName , "e2e_profile2" )
1115+ }
1116+
1117+ logs := s .getLogs ()
1118+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1119+ logger .Infof ("Successfully tested homogenous default partitioning" )
1120+ } else {
1121+ logger .Errorf ("Failure test homogenous partitioning" )
1122+ }
1123+ devCfgDcm := s .getDeviceConfigForDCM (c )
1124+ s .deleteDeviceConfig (devCfgDcm , c )
1125+
1126+ time .Sleep (60 * time .Second )
1127+
1128+ ctx := context .TODO ()
1129+ logger .Infof ("create %v" , s .cfgName )
1130+ devCfg := s .getDeviceConfig (c )
1131+ driverEnable := false
1132+ devCfg .Spec .Driver .Enable = & driverEnable
1133+ s .createDeviceConfig (devCfg , c )
1134+ s .checkNFDWorkerStatus (s .ns , c , "" )
1135+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1136+ s .verifyDeviceConfigStatus (devCfg , c )
1137+ s .verifyNodeGPULabel (devCfg , c )
1138+
1139+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "gpu" )
1140+ if err != nil {
1141+ logger .Errorf ("error: %v" , err )
1142+ }
1143+ var minGPU int = 10000
1144+ for _ , v := range ret {
1145+ if v < minGPU {
1146+ minGPU = v
1147+ }
1148+ }
1149+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1150+
1151+ gpuLimitCount := minGPU
1152+ gpuReqCount := minGPU
1153+
1154+ res := & v1.ResourceRequirements {
1155+ Limits : v1.ResourceList {
1156+ "amd.com/gpu" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1157+ },
1158+ Requests : v1.ResourceList {
1159+ "amd.com/gpu" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1160+ },
1161+ }
1162+
1163+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1164+ assert .NoError (c , err , "failed to deploy pods" )
1165+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "gpu" )
1166+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1167+
1168+ // delete
1169+ s .deleteDeviceConfig (devCfg , c )
1170+
1171+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1172+ assert .NoError (c , err , "failed to remove rocm pods" )
1173+ }
1174+
1175+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHomogeneousMixed (c * C ) {
1176+ if s .simEnable {
1177+ c .Skip ("Skipping for non amd gpu testbed" )
1178+ }
1179+ if ! dcmImageDefined {
1180+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1181+ }
1182+
1183+ s .configMapHelper (c )
1184+
1185+ logger .Infof ("Add node label after pod comes up" )
1186+ time .Sleep (30 * time .Second )
1187+
1188+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1189+ nodeNames := make ([]string , 0 )
1190+ for _ , node := range nodes {
1191+ nodeNames = append (nodeNames , node .Name )
1192+ }
1193+ for _ , nodeName := range nodeNames {
1194+ s .addRemoveNodeLabels (nodeName , "e2e_profile2" )
1195+ }
1196+
1197+ logs := s .getLogs ()
1198+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1199+ logger .Infof ("Successfully tested homogeneous partitioning" )
1200+ } else {
1201+ logger .Errorf ("Failure test homogeneous partitioning" )
1202+ }
1203+ devCfgDcm := s .getDeviceConfigForDCM (c )
1204+ s .deleteDeviceConfig (devCfgDcm , c )
1205+ time .Sleep (60 * time .Second )
1206+ ctx := context .TODO ()
1207+ logger .Infof ("create %v" , s .cfgName )
1208+ devCfg := s .getDeviceConfig (c )
1209+ driverEnable := false
1210+ devCfg .Spec .Driver .Enable = & driverEnable
1211+ devCfg .Spec .DevicePlugin .DevicePluginArguments = map [string ]string {"resource_naming_strategy" : "mixed" }
1212+ s .createDeviceConfig (devCfg , c )
1213+ s .checkNFDWorkerStatus (s .ns , c , "" )
1214+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1215+ s .verifyDeviceConfigStatus (devCfg , c )
1216+
1217+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "cpx_nps4" )
1218+ if err != nil {
1219+ logger .Errorf ("error: %v" , err )
1220+ }
1221+ var minGPU int = 10000
1222+ for _ , v := range ret {
1223+ if v < minGPU {
1224+ minGPU = v
1225+ }
1226+ }
1227+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1228+
1229+ gpuLimitCount := minGPU
1230+ gpuReqCount := minGPU
1231+
1232+ res := & v1.ResourceRequirements {
1233+ Limits : v1.ResourceList {
1234+ "amd.com/cpx_nps4" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1235+ },
1236+ Requests : v1.ResourceList {
1237+ "amd.com/cpx_nps4" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1238+ },
1239+ }
1240+
1241+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1242+ assert .NoError (c , err , "failed to deploy pods" )
1243+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "cpx_nps4" )
1244+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1245+
1246+ // delete
1247+ s .deleteDeviceConfig (devCfg , c )
1248+
1249+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1250+ assert .NoError (c , err , "failed to remove rocm pods" )
1251+
1252+ }
1253+
1254+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHeterogeneousMixed (c * C ) {
1255+ if s .simEnable {
1256+ c .Skip ("Skipping for non amd gpu testbed" )
1257+ }
1258+ if ! dcmImageDefined {
1259+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1260+ }
1261+
1262+ s .configMapHelper (c )
1263+
1264+ logger .Infof ("Add node label after pod comes up" )
1265+ time .Sleep (30 * time .Second )
1266+
1267+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1268+ nodeNames := make ([]string , 0 )
1269+ for _ , node := range nodes {
1270+ nodeNames = append (nodeNames , node .Name )
1271+ }
1272+ for _ , nodeName := range nodeNames {
1273+ s .addRemoveNodeLabels (nodeName , "e2e_profile1" )
1274+ }
1275+
1276+ logs := s .getLogs ()
1277+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1278+ logger .Infof ("Successfully tested homogeneous partitioning" )
1279+ } else {
1280+ logger .Errorf ("Failure test heterogenous partitioning" )
1281+ }
1282+ devCfgDcm := s .getDeviceConfigForDCM (c )
1283+ s .deleteDeviceConfig (devCfgDcm , c )
1284+ time .Sleep (60 * time .Second )
1285+
1286+ ctx := context .TODO ()
1287+ logger .Infof ("create %v" , s .cfgName )
1288+ devCfg := s .getDeviceConfig (c )
1289+ driverEnable := false
1290+ devCfg .Spec .Driver .Enable = & driverEnable
1291+ devCfg .Spec .DevicePlugin .DevicePluginArguments = map [string ]string {"resource_naming_strategy" : "mixed" }
1292+ s .createDeviceConfig (devCfg , c )
1293+ s .checkNFDWorkerStatus (s .ns , c , "" )
1294+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1295+ s .verifyDeviceConfigStatus (devCfg , c )
1296+
1297+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "cpx_nps1" )
1298+ if err != nil {
1299+ logger .Errorf ("error: %v" , err )
1300+ }
1301+ var minGPU int = 10000
1302+ for _ , v := range ret {
1303+ if v < minGPU {
1304+ minGPU = v
1305+ }
1306+ }
1307+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1308+
1309+ gpuLimitCount := minGPU
1310+ gpuReqCount := minGPU
1311+
1312+ res := & v1.ResourceRequirements {
1313+ Limits : v1.ResourceList {
1314+ "amd.com/cpx_nps1" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1315+ },
1316+ Requests : v1.ResourceList {
1317+ "amd.com/cpx_nps1" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1318+ },
1319+ }
1320+
1321+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1322+ assert .NoError (c , err , "failed to deploy pods" )
1323+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "cpx_nps1" )
1324+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1325+
1326+ // delete
1327+ s .deleteDeviceConfig (devCfg , c )
1328+
1329+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1330+ assert .NoError (c , err , "failed to remove rocm pods" )
1331+ }
1332+
10951333func (s * E2ESuite ) TestKubeRbacProxyClusterIP (c * C ) {
10961334 _ , err := s .dClient .DeviceConfigs (s .ns ).Get ("deviceconfig-kuberbac-clusterip" , metav1.GetOptions {})
10971335 assert .Errorf (c , err , "config deviceconfig-kuberbac-clusterip exists" )
0 commit comments