@@ -21,6 +21,7 @@ import (
21
21
"os/exec"
22
22
23
23
"github.com/google/go-cmp/cmp/cmpopts"
24
+ kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
24
25
"github.com/onsi/ginkgo/v2"
25
26
"github.com/onsi/gomega"
26
27
batchv1 "k8s.io/api/batch/v1"
@@ -37,9 +38,11 @@ import (
37
38
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
38
39
workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
39
40
workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
41
+ workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob"
40
42
utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
41
43
testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
42
44
testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
45
+ testing "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob"
43
46
"sigs.k8s.io/kueue/pkg/workload"
44
47
"sigs.k8s.io/kueue/test/util"
45
48
)
@@ -364,6 +367,103 @@ var _ = ginkgo.Describe("MultiKueue", func() {
364
367
util .IgnoreConditionTimestampsAndObservedGeneration )))
365
368
})
366
369
})
370
+ ginkgo .It ("Should run a kubeflow tfjob on worker if admitted" , func () {
371
+ tfJob := testing .MakeTFJob ("tfjob1" , managerNs .Name ).
372
+ Queue (managerLq .Name ).
373
+ TFReplicaSpecs (
374
+ testing.TFReplicaSpecRequirement {
375
+ ReplicaType : kftraining .TFJobReplicaTypePS ,
376
+ ReplicaCount : 1 ,
377
+ Annotations : map [string ]string {
378
+ "sidecar.istio.io/inject" : "false" ,
379
+ },
380
+ RestartPolicy : "Never" ,
381
+ Image : "kubeflow/tf-dist-mnist-test:v1-855e096" ,
382
+ },
383
+ testing.TFReplicaSpecRequirement {
384
+ ReplicaType : kftraining .TFJobReplicaTypeWorker ,
385
+ ReplicaCount : 2 ,
386
+ Annotations : map [string ]string {
387
+ "sidecar.istio.io/inject" : "false" ,
388
+ },
389
+ RestartPolicy : "OnFailure" ,
390
+ Image : "kubeflow/tf-dist-mnist-test:v1-855e096" ,
391
+ },
392
+ ).
393
+ Request (kftraining .TFJobReplicaTypePS , corev1 .ResourceCPU , "1" ).
394
+ Request (kftraining .TFJobReplicaTypePS , corev1 .ResourceMemory , "200M" ).
395
+ Request (kftraining .TFJobReplicaTypeWorker , corev1 .ResourceCPU , "0.5" ).
396
+ Request (kftraining .TFJobReplicaTypeWorker , corev1 .ResourceMemory , "100M" ).
397
+ Obj ()
398
+
399
+ ginkgo .By ("Creating the tfJob" , func () {
400
+ gomega .Expect (k8sManagerClient .Create (ctx , tfJob )).Should (gomega .Succeed ())
401
+ })
402
+
403
+ createdLeaderWorkload := & kueue.Workload {}
404
+ wlLookupKey := types.NamespacedName {Name : workloadtfjob .GetWorkloadNameForTFJob (tfJob .Name , tfJob .UID ), Namespace : managerNs .Name }
405
+
406
+ // the execution should be given to the worker
407
+ ginkgo .By ("Waiting to be admitted in worker1 and manager" , func () {
408
+ gomega .Eventually (func (g gomega.Gomega ) {
409
+ g .Expect (k8sManagerClient .Get (ctx , wlLookupKey , createdLeaderWorkload )).To (gomega .Succeed ())
410
+ g .Expect (apimeta .FindStatusCondition (createdLeaderWorkload .Status .Conditions , kueue .WorkloadAdmitted )).To (gomega .BeComparableTo (& metav1.Condition {
411
+ Type : kueue .WorkloadAdmitted ,
412
+ Status : metav1 .ConditionTrue ,
413
+ Reason : "Admitted" ,
414
+ Message : "The workload is admitted" ,
415
+ }, util .IgnoreConditionTimestampsAndObservedGeneration ))
416
+ g .Expect (workload .FindAdmissionCheck (createdLeaderWorkload .Status .AdmissionChecks , multiKueueAc .Name )).To (gomega .BeComparableTo (& kueue.AdmissionCheckState {
417
+ Name : multiKueueAc .Name ,
418
+ State : kueue .CheckStateReady ,
419
+ Message : `The workload got reservation on "worker1"` ,
420
+ }, cmpopts .IgnoreFields (kueue.AdmissionCheckState {}, "LastTransitionTime" )))
421
+ }, util .Timeout , util .Interval ).Should (gomega .Succeed ())
422
+ })
423
+
424
+ ginkgo .By ("Waiting for the tfJob to get status updates" , func () {
425
+ gomega .Eventually (func (g gomega.Gomega ) {
426
+ createdTfJob := & kftraining.TFJob {}
427
+ g .Expect (k8sManagerClient .Get (ctx , client .ObjectKeyFromObject (tfJob ), createdTfJob )).To (gomega .Succeed ())
428
+ g .Expect (createdTfJob .Status .ReplicaStatuses ).To (gomega .BeComparableTo (
429
+ map [kftraining.ReplicaType ]* kftraining.ReplicaStatus {
430
+ kftraining .TFJobReplicaTypePS : {
431
+ Active : 1 ,
432
+ Succeeded : 0 ,
433
+ },
434
+ kftraining .TFJobReplicaTypeWorker : {
435
+ Active : 2 ,
436
+ Succeeded : 0 ,
437
+ },
438
+ },
439
+ util .IgnoreConditionTimestampsAndObservedGeneration ))
440
+ }, 3 * util .LongTimeout , util .Interval ).Should (gomega .Succeed ())
441
+ })
442
+
443
+ ginkgo .By ("Waiting for the tfJob to finish" , func () {
444
+ gomega .Eventually (func (g gomega.Gomega ) {
445
+ g .Expect (k8sManagerClient .Get (ctx , wlLookupKey , createdLeaderWorkload )).To (gomega .Succeed ())
446
+
447
+ g .Expect (apimeta .FindStatusCondition (createdLeaderWorkload .Status .Conditions , kueue .WorkloadFinished )).To (gomega .BeComparableTo (& metav1.Condition {
448
+ Type : kueue .WorkloadFinished ,
449
+ Status : metav1 .ConditionTrue ,
450
+ Reason : kueue .WorkloadFinishedReasonSucceeded ,
451
+ Message : fmt .Sprintf ("TFJob %s/%s successfully completed." , createdLeaderWorkload .Namespace , tfJob .Name ),
452
+ }, util .IgnoreConditionTimestampsAndObservedGeneration ))
453
+ }, 2 * util .LongTimeout , util .Interval ).Should (gomega .Succeed ())
454
+ })
455
+
456
+ ginkgo .By ("Checking no objects are left in the worker clusters and the tfJob is completed" , func () {
457
+ gomega .Eventually (func (g gomega.Gomega ) {
458
+ workerWl := & kueue.Workload {}
459
+ g .Expect (k8sWorker1Client .Get (ctx , wlLookupKey , workerWl )).To (utiltesting .BeNotFoundError ())
460
+ g .Expect (k8sWorker2Client .Get (ctx , wlLookupKey , workerWl )).To (utiltesting .BeNotFoundError ())
461
+ workerTfJob := & kftraining.TFJob {}
462
+ g .Expect (k8sWorker1Client .Get (ctx , client .ObjectKeyFromObject (tfJob ), workerTfJob )).To (utiltesting .BeNotFoundError ())
463
+ g .Expect (k8sWorker2Client .Get (ctx , client .ObjectKeyFromObject (tfJob ), workerTfJob )).To (utiltesting .BeNotFoundError ())
464
+ }, util .Timeout , util .Interval ).Should (gomega .Succeed ())
465
+ })
466
+ })
367
467
})
368
468
ginkgo .When ("The connection to a worker cluster is unreliable" , func () {
369
469
ginkgo .It ("Should update the cluster status to reflect the connection state" , func () {
0 commit comments