@@ -21,6 +21,7 @@ import (
 	"os/exec"

 	"github.com/google/go-cmp/cmp/cmpopts"
+	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
 	batchv1 "k8s.io/api/batch/v1"
@@ -37,9 +38,11 @@ import (
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 	workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
 	workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
+	workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob"
 	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
 	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
 	testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
+	testingtfjob "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob"
 	"sigs.k8s.io/kueue/pkg/workload"
 	"sigs.k8s.io/kueue/test/util"
 )
@@ -364,6 +367,118 @@ var _ = ginkgo.Describe("MultiKueue", func() {
 					util.IgnoreConditionTimestampsAndObservedGeneration)))
 			})
 		})
+		ginkgo.It("Should run a kubeflow TFJob on worker if admitted", func() {
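+			// Build a TFJob with one Chief, one PS and two Worker replicas;
+			// Istio sidecar injection is disabled on each replica so the pods
+			// can terminate once their containers exit.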
+			tfJob := testingtfjob.MakeTFJob("tfjob1", managerNs.Name).
+				Queue(managerLq.Name).
+				TFReplicaSpecs(
+					testingtfjob.TFReplicaSpecRequirement{
+						ReplicaType:  kftraining.TFJobReplicaTypeChief,
+						ReplicaCount: 1,
+						Annotations: map[string]string{
+							"sidecar.istio.io/inject": "false",
+						},
+						RestartPolicy: "OnFailure",
+					},
+					testingtfjob.TFReplicaSpecRequirement{
+						ReplicaType:  kftraining.TFJobReplicaTypePS,
+						ReplicaCount: 1,
+						Annotations: map[string]string{
+							"sidecar.istio.io/inject": "false",
+						},
+						RestartPolicy: "Never",
+					},
+					testingtfjob.TFReplicaSpecRequirement{
+						ReplicaType:  kftraining.TFJobReplicaTypeWorker,
+						ReplicaCount: 2,
+						Annotations: map[string]string{
+							"sidecar.istio.io/inject": "false",
+						},
+						RestartPolicy: "OnFailure",
+					},
+				).
+				Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceCPU, "0.5").
+				Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceMemory, "200M").
+				Request(kftraining.TFJobReplicaTypePS, corev1.ResourceCPU, "0.5").
+				Request(kftraining.TFJobReplicaTypePS, corev1.ResourceMemory, "200M").
+				Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceCPU, "0.5").
+				Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceMemory, "100M").
+				Image(kftraining.TFJobReplicaTypeChief, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"5s"}).
+				Image(kftraining.TFJobReplicaTypePS, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"5s"}).
+				Image(kftraining.TFJobReplicaTypeWorker, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"5s"}).
+				Obj()
+
+			ginkgo.By("Creating the TfJob", func() {
+				gomega.Expect(k8sManagerClient.Create(ctx, tfJob)).Should(gomega.Succeed())
+			})
+
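+			// The Workload created for the TFJob is named after the TFJob's
+			// name and UID, so it can be looked up on the manager cluster.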
+			createdLeaderWorkload := &kueue.Workload{}
+			wlLookupKey := types.NamespacedName{Name: workloadtfjob.GetWorkloadNameForTFJob(tfJob.Name, tfJob.UID), Namespace: managerNs.Name}
+
+			// The execution should be delegated to the worker.
+			ginkgo.By("Waiting to be admitted in worker1 and manager", func() {
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())
+					g.Expect(apimeta.FindStatusCondition(createdLeaderWorkload.Status.Conditions, kueue.WorkloadAdmitted)).To(gomega.BeComparableTo(&metav1.Condition{
+						Type:    kueue.WorkloadAdmitted,
+						Status:  metav1.ConditionTrue,
+						Reason:  "Admitted",
+						Message: "The workload is admitted",
+					}, util.IgnoreConditionTimestampsAndObservedGeneration))
+					g.Expect(workload.FindAdmissionCheck(createdLeaderWorkload.Status.AdmissionChecks, multiKueueAc.Name)).To(gomega.BeComparableTo(&kueue.AdmissionCheckState{
+						Name:    multiKueueAc.Name,
+						State:   kueue.CheckStateReady,
+						Message: `The workload got reservation on "worker1"`,
+					}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+			})
+
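+			// The replica statuses reported on the manager's copy of the TFJob
+			// are expected to be synced back from the worker running the job.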
+			ginkgo.By("Waiting for the TfJob to get status updates", func() {
+				gomega.Eventually(func(g gomega.Gomega) {
+					createdTfJob := &kftraining.TFJob{}
+					g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(tfJob), createdTfJob)).To(gomega.Succeed())
+					g.Expect(createdTfJob.Status.ReplicaStatuses).To(gomega.BeComparableTo(
+						map[kftraining.ReplicaType]*kftraining.ReplicaStatus{
+							kftraining.TFJobReplicaTypeChief: {
+								Active:    1,
+								Succeeded: 0,
+							},
+							kftraining.TFJobReplicaTypePS: {
+								Active:    1,
+								Succeeded: 0,
+							},
+							kftraining.TFJobReplicaTypeWorker: {
+								Active:    2,
+								Succeeded: 0,
+							},
+						},
+						util.IgnoreConditionTimestampsAndObservedGeneration))
+				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
+			})
+
+			ginkgo.By("Waiting for the TfJob to finish", func() {
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())
+
+					g.Expect(apimeta.FindStatusCondition(createdLeaderWorkload.Status.Conditions, kueue.WorkloadFinished)).To(gomega.BeComparableTo(&metav1.Condition{
+						Type:    kueue.WorkloadFinished,
+						Status:  metav1.ConditionTrue,
+						Reason:  kueue.WorkloadFinishedReasonSucceeded,
+						Message: fmt.Sprintf("TFJob %s/%s successfully completed.", createdLeaderWorkload.Namespace, tfJob.Name),
+					}, util.IgnoreConditionTimestampsAndObservedGeneration))
+				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
+			})
+
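+			// Once the workload has finished, MultiKueue is expected to clean up
+			// the remote objects: neither the Workload nor the TFJob should be
+			// left on either worker cluster.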
+			ginkgo.By("Checking no objects are left in the worker clusters and the TfJob is completed", func() {
+				gomega.Eventually(func(g gomega.Gomega) {
+					workerWl := &kueue.Workload{}
+					g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
+					g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
+					workerTfJob := &kftraining.TFJob{}
+					g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(tfJob), workerTfJob)).To(utiltesting.BeNotFoundError())
+					g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(tfJob), workerTfJob)).To(utiltesting.BeNotFoundError())
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+			})
+		})
 	})
 	ginkgo.When("The connection to a worker cluster is unreliable", func() {
 		ginkgo.It("Should update the cluster status to reflect the connection state", func() {