Skip to content

Commit 0149eb0

Browse files
akshaychitneni (Akshay Chitneni)
and
Akshay Chitneni
authored
KEP-2170: Adding CEL validations on v2 TrainJob CRD (#2260)
Signed-off-by: Akshay Chitneni <[email protected]>
Co-authored-by: Akshay Chitneni <[email protected]>
1 parent 6965c1a commit 0149eb0

File tree

4 files changed

+111
-3
lines changed

4 files changed

+111
-3
lines changed

manifests/v2/base/crds/kubeflow.org_trainjobs.yaml

+10-1
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,7 @@ spec:
197197
They will be merged with the TrainingRuntime values.
198198
type: object
199199
managedBy:
200+
default: kubeflow.org/trainjob-controller
200201
description: |-
201202
ManagedBy is used to indicate the controller or entity that manages a TrainJob.
202203
The value must be either an empty, `kubeflow.org/trainjob-controller` or
@@ -206,6 +207,12 @@ spec:
206207
with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.
207208
Defaults to `kubeflow.org/trainjob-controller`
208209
type: string
210+
x-kubernetes-validations:
211+
- message: ManagedBy must be kubeflow.org/trainjob-controller or kueue.x-k8s.io/multikueue
212+
if set
213+
rule: self in ['kubeflow.org/trainjob-controller', 'kueue.x-k8s.io/multikueue']
214+
- message: ManagedBy value is immutable
215+
rule: self == oldSelf
209216
modelConfig:
210217
description: Configuration of the pre-trained and trained model.
211218
properties:
@@ -2736,14 +2743,15 @@ spec:
27362743
description: Reference to the training runtime.
27372744
properties:
27382745
apiGroup:
2746+
default: kubeflow.org
27392747
description: |-
27402748
APIGroup of the runtime being referenced.
27412749
Defaults to `kubeflow.org`.
27422750
type: string
27432751
kind:
2752+
default: ClusterTrainingRuntime
27442753
description: |-
27452754
Kind of the runtime being referenced.
2746-
It must be one of TrainingRuntime or ClusterTrainingRuntime.
27472755
Defaults to ClusterTrainingRuntime.
27482756
type: string
27492757
name:
@@ -2756,6 +2764,7 @@ spec:
27562764
- name
27572765
type: object
27582766
suspend:
2767+
default: false
27592768
description: |-
27602769
Whether the controller should suspend the running TrainJob.
27612770
Defaults to false.

pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go

+10-1
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,10 @@ import (
2222
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
2323
)
2424

25+
const (
26+
TrainJobKind string = "TrainJob"
27+
)
28+
2529
// +genclient
2630
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
2731
// +kubebuilder:object:root=true
@@ -87,6 +91,7 @@ type TrainJobSpec struct {
8791

8892
// Whether the controller should suspend the running TrainJob.
8993
// Defaults to false.
94+
// +kubebuilder:default=false
9095
Suspend *bool `json:"suspend,omitempty"`
9196

9297
// ManagedBy is used to indicate the controller or entity that manages a TrainJob.
@@ -96,6 +101,9 @@ type TrainJobSpec struct {
96101
// `kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs
97102
// with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.
98103
// Defaults to `kubeflow.org/trainjob-controller`
104+
// +kubebuilder:default="kubeflow.org/trainjob-controller"
105+
// +kubebuilder:validation:XValidation:rule="self in ['kubeflow.org/trainjob-controller', 'kueue.x-k8s.io/multikueue']", message="ManagedBy must be kubeflow.org/trainjob-controller or kueue.x-k8s.io/multikueue if set"
106+
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="ManagedBy value is immutable"
99107
ManagedBy *string `json:"managedBy,omitempty"`
100108
}
101109

@@ -108,11 +116,12 @@ type RuntimeRef struct {
108116

109117
// APIGroup of the runtime being referenced.
110118
// Defaults to `kubeflow.org`.
119+
// +kubebuilder:default="kubeflow.org"
111120
APIGroup *string `json:"apiGroup,omitempty"`
112121

113122
// Kind of the runtime being referenced.
114-
// It must be one of TrainingRuntime or ClusterTrainingRuntime.
115123
// Defaults to ClusterTrainingRuntime.
124+
// +kubebuilder:default="ClusterTrainingRuntime"
116125
Kind *string `json:"kind,omitempty"`
117126
}
118127

test/integration/controller.v2/trainjob_controller_test.go

+90
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/onsi/gomega"
2222
corev1 "k8s.io/api/core/v1"
2323
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
24+
"k8s.io/utils/ptr"
2425
"sigs.k8s.io/controller-runtime/pkg/client"
2526

2627
kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1"
@@ -71,4 +72,93 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
7172
gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed())
7273
})
7374
})
75+
76+
ginkgo.When("TrainJob CR Validation", func() {
77+
ginkgo.AfterEach(func() {
78+
gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainJob{}, client.InNamespace(ns.Name))).Should(
79+
gomega.Succeed())
80+
})
81+
82+
ginkgo.It("Should succeed in creating TrainJob", func() {
83+
84+
managedBy := "kubeflow.org/trainjob-controller"
85+
86+
trainingRuntimeRef := kubeflowv2.RuntimeRef{
87+
Name: "TorchRuntime",
88+
APIGroup: ptr.To(kubeflowv2.GroupVersion.Group),
89+
Kind: ptr.To(kubeflowv2.TrainingRuntimeKind),
90+
}
91+
jobSpec := kubeflowv2.TrainJobSpec{
92+
RuntimeRef: trainingRuntimeRef,
93+
ManagedBy: &managedBy,
94+
}
95+
trainJob := &kubeflowv2.TrainJob{
96+
TypeMeta: metav1.TypeMeta{
97+
APIVersion: kubeflowv2.SchemeGroupVersion.String(),
98+
Kind: kubeflowv2.TrainJobKind,
99+
},
100+
ObjectMeta: metav1.ObjectMeta{
101+
GenerateName: "valid-trainjob-",
102+
Namespace: ns.Name,
103+
},
104+
Spec: jobSpec,
105+
}
106+
107+
err := k8sClient.Create(ctx, trainJob)
108+
gomega.Expect(err).Should(gomega.Succeed())
109+
})
110+
111+
ginkgo.It("Should fail in creating TrainJob with invalid spec.managedBy", func() {
112+
managedBy := "invalidManagedBy"
113+
jobSpec := kubeflowv2.TrainJobSpec{
114+
ManagedBy: &managedBy,
115+
}
116+
trainJob := &kubeflowv2.TrainJob{
117+
TypeMeta: metav1.TypeMeta{
118+
APIVersion: kubeflowv2.SchemeGroupVersion.String(),
119+
Kind: kubeflowv2.TrainJobKind,
120+
},
121+
ObjectMeta: metav1.ObjectMeta{
122+
Name: "invalid-trainjob",
123+
Namespace: ns.Name,
124+
},
125+
Spec: jobSpec,
126+
}
127+
gomega.Expect(k8sClient.Create(ctx, trainJob)).To(gomega.MatchError(
128+
gomega.ContainSubstring("spec.managedBy: Invalid value")))
129+
})
130+
131+
ginkgo.It("Should fail in updating spec.managedBy", func() {
132+
133+
managedBy := "kubeflow.org/trainjob-controller"
134+
135+
trainingRuntimeRef := kubeflowv2.RuntimeRef{
136+
Name: "TorchRuntime",
137+
APIGroup: ptr.To(kubeflowv2.GroupVersion.Group),
138+
Kind: ptr.To(kubeflowv2.TrainingRuntimeKind),
139+
}
140+
jobSpec := kubeflowv2.TrainJobSpec{
141+
RuntimeRef: trainingRuntimeRef,
142+
ManagedBy: &managedBy,
143+
}
144+
trainJob := &kubeflowv2.TrainJob{
145+
TypeMeta: metav1.TypeMeta{
146+
APIVersion: kubeflowv2.SchemeGroupVersion.String(),
147+
Kind: kubeflowv2.TrainJobKind,
148+
},
149+
ObjectMeta: metav1.ObjectMeta{
150+
Name: "job-with-failed-update",
151+
Namespace: ns.Name,
152+
},
153+
Spec: jobSpec,
154+
}
155+
156+
gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed())
157+
updatedManagedBy := "kueue.x-k8s.io/multikueue"
158+
jobSpec.ManagedBy = &updatedManagedBy
159+
trainJob.Spec = jobSpec
160+
gomega.Expect(k8sClient.Update(ctx, trainJob)).To(gomega.MatchError(
161+
gomega.ContainSubstring("ManagedBy value is immutable")))
162+
})
163+
})
74164
})

0 commit comments

Comments
 (0)