From b6f5ec0003a00332a7c91b48095d42f978c68fc6 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 2 Oct 2025 04:43:39 +0000 Subject: [PATCH 1/6] Add top-level Labels and Resources fields Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 2 + .../crds/ray.io_rayclusters.yaml | 24 +++ .../kuberay-operator/crds/ray.io_rayjobs.yaml | 24 +++ .../crds/ray.io_rayservices.yaml | 24 +++ ray-operator/apis/ray/v1/raycluster_types.go | 14 ++ .../apis/ray/v1/zz_generated.deepcopy.go | 28 +++ .../config/crd/bases/ray.io_rayclusters.yaml | 24 +++ .../config/crd/bases/ray.io_rayjobs.yaml | 24 +++ .../config/crd/bases/ray.io_rayservices.yaml | 24 +++ ray-operator/controllers/ray/common/pod.go | 86 ++++++++- .../controllers/ray/common/pod_test.go | 166 ++++++++++++++++++ ray-operator/controllers/ray/utils/util.go | 38 +++- .../controllers/ray/utils/util_test.go | 106 ++++++++++- .../ray/v1/headgroupspec.go | 24 +++ .../ray/v1/workergroupspec.go | 49 ++++-- 15 files changed, 629 insertions(+), 28 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 4b495fef69e..bf3dbef74a2 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -138,6 +138,7 @@ _Appears in:_ | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is the exact pod template used in K8s deployments, statefulsets, etc. | | | | `headService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | HeadService is the Kubernetes service of the head pod. | | | | `enableIngress` _boolean_ | EnableIngress indicates whether operator should create ingress object for head service or not. | | | +| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for the head group.
These labels will also be added to the Pods of this head group. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: node-manager-port, object-store-memory, ... | | | | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | @@ -417,6 +418,7 @@ _Appears in:_ | `minReplicas` _integer_ | MinReplicas denotes the minimum number of desired Pods for this worker group. | 0 | | | `maxReplicas` _integer_ | MaxReplicas denotes the maximum number of desired Pods for this worker group, and the default value is maxInt32. | 2147483647 | | | `idleTimeoutSeconds` _integer_ | IdleTimeoutSeconds denotes the number of seconds to wait before the v2 autoscaler terminates an idle worker pod of this type.
This value is only used with the Ray Autoscaler enabled and defaults to the value set by the AutoscalingConfig if not specified for this worker group. | | | +| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for this worker group.
These labels will also be added to the Pods of this worker group. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: address, object-store-memory, ... | | | | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is a pod template for the worker | | | | `scaleStrategy` _[ScaleStrategy](#scalestrategy)_ | ScaleStrategy defines which pods to remove | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml index a8e1f1df8f1..bb6fe99d1a6 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml @@ -631,10 +631,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4332,6 +4344,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4352,6 +4368,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 8f8679ca607..4d431edf230 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -681,10 +681,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4382,6 +4394,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4402,6 +4418,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index a86457fac1a..8c74a4edfaf 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -611,10 +611,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4312,6 +4324,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4332,6 +4348,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/ray-operator/apis/ray/v1/raycluster_types.go b/ray-operator/apis/ray/v1/raycluster_types.go index 4ac12c79725..f7e944718ac 100644 --- a/ray-operator/apis/ray/v1/raycluster_types.go +++ b/ray-operator/apis/ray/v1/raycluster_types.go @@ -75,6 +75,13 @@ type HeadGroupSpec struct { // EnableIngress indicates whether operator should create ingress object for head service or not. // +optional EnableIngress *bool `json:"enableIngress,omitempty"` + // Resources specifies the resource quantities for the head group. + // +optional + Resources corev1.ResourceList `json:"resources,omitempty"` + // Labels specifies the Ray node labels for the head group. + // These labels will also be added to the Pods of this head group. + // +optional + Labels map[string]string `json:"labels,omitempty"` // RayStartParams are the params of the start command: node-manager-port, object-store-memory, ... // +optional RayStartParams map[string]string `json:"rayStartParams"` @@ -106,6 +113,13 @@ type WorkerGroupSpec struct { // This value is only used with the Ray Autoscaler enabled and defaults to the value set by the AutoscalingConfig if not specified for this worker group. // +optional IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` + // Resources specifies the resource quantities for this worker group. + // +optional + Resources corev1.ResourceList `json:"resources,omitempty"` + // Labels specifies the Ray node labels for this worker group. + // These labels will also be added to the Pods of this worker group. + // +optional + Labels map[string]string `json:"labels,omitempty"` // RayStartParams are the params of the start command: address, object-store-memory, ... // +optional RayStartParams map[string]string `json:"rayStartParams"` diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index b4cb5decf12..7a60f940887 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -179,6 +179,20 @@ func (in *HeadGroupSpec) DeepCopyInto(out *HeadGroupSpec) { *out = new(bool) **out = **in } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.RayStartParams != nil { in, out := &in.RayStartParams, &out.RayStartParams *out = make(map[string]string, len(*in)) @@ -827,6 +841,20 @@ func (in *WorkerGroupSpec) DeepCopyInto(out *WorkerGroupSpec) { *out = new(int32) **out = **in } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.RayStartParams != nil { in, out := &in.RayStartParams, &out.RayStartParams *out = make(map[string]string, len(*in)) diff --git a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml index a8e1f1df8f1..bb6fe99d1a6 100644 --- a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml @@ -631,10 +631,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4332,6 +4344,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4352,6 +4368,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 8f8679ca607..4d431edf230 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -681,10 +681,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4382,6 +4394,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4402,6 +4418,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index a86457fac1a..8c74a4edfaf 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -611,10 +611,22 @@ spec: type: object type: object type: object + labels: + additionalProperties: + type: string + type: object rayStartParams: additionalProperties: type: string type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object serviceType: type: string template: @@ -4312,6 +4324,10 @@ spec: idleTimeoutSeconds: format: int32 type: integer + labels: + additionalProperties: + type: string + type: object maxReplicas: default: 2147483647 format: int32 @@ -4332,6 +4348,14 @@ spec: default: 0 format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object scaleStrategy: properties: workersToDelete: diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 2b5aae59e3e..8d4cf5ea9e2 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -172,10 +172,16 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head // This ensures privilege of KubeRay users are contained within the namespace of the RayCluster. podTemplate.ObjectMeta.Namespace = instance.Namespace - if podTemplate.Labels == nil { - podTemplate.Labels = make(map[string]string) - } - podTemplate.Labels = labelPod(rayv1.HeadNode, instance.Name, utils.RayNodeHeadGroupLabelValue, instance.Spec.HeadGroupSpec.Template.ObjectMeta.Labels) + // Reconcile top-level Resources for head group with rayStartParams. + reconcileRayStartParamsResources(headSpec.RayStartParams, headSpec.Resources) + + // Reconcile top-level Labels for head group with `--labels` in rayStartParams. + reconcileRayStartParamsLabels(headSpec.RayStartParams, headSpec.Labels) + + // Merge K8s labels from the Pod template and the top-level `Labels` field. + mergedLabels := mergeLabels(headSpec.Template.ObjectMeta.Labels, headSpec.Labels) + podTemplate.Labels = labelPod(rayv1.HeadNode, instance.Name, utils.RayNodeHeadGroupLabelValue, mergedLabels) + headSpec.RayStartParams = setMissingRayStartParams(ctx, headSpec.RayStartParams, rayv1.HeadNode, headPort, "") initTemplateAnnotations(instance, &podTemplate) @@ -311,10 +317,17 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo // If the replica of workers is more than 1, `ObjectMeta.Name` may cause name conflict errors. // Hence, we set `ObjectMeta.Name` to an empty string, and use GenerateName to prevent name conflicts. podTemplate.ObjectMeta.Name = "" - if podTemplate.Labels == nil { - podTemplate.Labels = make(map[string]string) - } - podTemplate.Labels = labelPod(rayv1.WorkerNode, instance.Name, workerSpec.GroupName, workerSpec.Template.ObjectMeta.Labels) + + // Reconcile top-level Resources for worker group with rayStartParams. + reconcileRayStartParamsResources(workerSpec.RayStartParams, workerSpec.Resources) + + // Reconcile top-level Labels for worker group with `--labels` in rayStartParams. + reconcileRayStartParamsLabels(workerSpec.RayStartParams, workerSpec.Labels) + + // Merge K8s labels from the Pod template and the top-level `Labels` field. + mergedLabels := mergeLabels(workerSpec.Template.ObjectMeta.Labels, workerSpec.Labels) + podTemplate.Labels = labelPod(rayv1.WorkerNode, instance.Name, workerSpec.GroupName, mergedLabels) + workerSpec.RayStartParams = setMissingRayStartParams(ctx, workerSpec.RayStartParams, rayv1.WorkerNode, headPort, fqdnRayIP) initTemplateAnnotations(instance, &podTemplate) @@ -1026,3 +1039,60 @@ func findMemoryReqOrLimit(container corev1.Container) (res *resource.Quantity) { } return nil } + +// reconcileRayStartParamsLabels reconciles `--labels` in rayStartParams based on group `Labels`. +func reconcileRayStartParamsLabels(rayStartParams map[string]string, groupLabels map[string]string) { + if len(groupLabels) > 0 { + var labels []string + // Sort label keys for deterministic output. + keys := make([]string, 0, len(groupLabels)) + for k := range groupLabels { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + labels = append(labels, fmt.Sprintf("%s=%s", k, groupLabels[k])) + } + // When provided this will override any `--labels` value specified in rayStartParams. + rayStartParams["labels"] = strings.Join(labels, ",") + } +} + +// reconcileRayStartParamsResources reconciles rayStartParams based on the top-level `Resources` field. +func reconcileRayStartParamsResources(rayStartParams map[string]string, topLevelResources corev1.ResourceList) { + if len(topLevelResources) > 0 { + // Override relevant rayStartParams fields to ensure consistency. + rayResourcesJson := make(map[string]float64) + for name, quantity := range topLevelResources { + strName := string(name) + if name == corev1.ResourceCPU { + rayStartParams["num-cpus"] = strconv.FormatInt(quantity.Value(), 10) + } else if name == corev1.ResourceMemory { + rayStartParams["memory"] = strconv.FormatInt(quantity.Value(), 10) + } else if utils.IsGPUResourceKey(strName) { + rayStartParams["num-gpus"] = strconv.FormatInt(quantity.Value(), 10) + } else { + rayResourcesJson[strName] = quantity.AsApproximateFloat64() + } + } + + if len(rayResourcesJson) > 0 { + jsonBytes, _ := json.Marshal(rayResourcesJson) + rayStartParams["resources"] = fmt.Sprintf("'%s'", string(jsonBytes)) + } + } +} + +// mergeLabels combines labels from a pod template and a top-level `labels` spec, +// with the top-level labels field taking precedence. +func mergeLabels(templateLabels map[string]string, topLevelLabels map[string]string) map[string]string { + merged := make(map[string]string) + for k, v := range templateLabels { + merged[k] = v + } + for k, v := range topLevelLabels { + merged[k] = v + } + return merged +} diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 907b5326efc..12c79c363e4 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1888,3 +1888,169 @@ func TestSetAutoscalerV2EnvVars(t *testing.T) { }) } } + +func TestMergeLabels(t *testing.T) { + tests := map[string]struct { + templateSpecLabels map[string]string + workerGroupLabels map[string]string + expectedLabels map[string]string + }{ + "Non-overlapping labels don't override.": { + templateSpecLabels: map[string]string{"pod-label-key": "pod-label-value"}, + workerGroupLabels: map[string]string{"ray/io:some-label": "ray-node-label-value"}, + expectedLabels: map[string]string{"pod-label-key": "pod-label-value", "ray/io:some-label": "ray-node-label-value"}, + }, + "Overlapping labels override with precedence for group `Labels`.": { + templateSpecLabels: map[string]string{"accelerator-type": "GPU", "market-type": "spot"}, + workerGroupLabels: map[string]string{"accelerator-type": "TPU-V6E"}, + expectedLabels: map[string]string{"accelerator-type": "TPU-V6E", "market-type": "spot"}, + }, + "Empty Pod Spec labels.": { + templateSpecLabels: map[string]string{}, + workerGroupLabels: map[string]string{"group-labels": "group-label-value"}, + expectedLabels: map[string]string{"group-labels": "group-label-value"}, + }, + "Empty `Labels` field for group.": { + templateSpecLabels: map[string]string{"pod-label": "pod-label-value"}, + workerGroupLabels: map[string]string{}, + expectedLabels: map[string]string{"pod-label": "pod-label-value"}, + }, + "Both `Labels` and labels in Pod spec nil.": { + templateSpecLabels: nil, + workerGroupLabels: nil, + expectedLabels: map[string]string{}, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + merged := mergeLabels(tc.templateSpecLabels, tc.workerGroupLabels) + assert.Equal(t, tc.expectedLabels, merged) + }) + } +} + +func TestReconcileRayStartParamsLabels(t *testing.T) { + tests := map[string]struct { + initialRayStartParams map[string]string + groupLabels map[string]string + expectedRayStartParams map[string]string + }{ + "Set labels in group `Labels` to `--labels` empty rayStartParams.": { + initialRayStartParams: map[string]string{}, + groupLabels: map[string]string{ + "topology.kubernetes.io/zone": "us-central2", + "ray.io/node-group": "worker-group-1", + "cloud.google.com/gke-spot": "true", + }, + // The output string is sorted alphabetically by key. + expectedRayStartParams: map[string]string{ + "labels": "cloud.google.com/gke-spot=true,ray.io/node-group=worker-group-1,topology.kubernetes.io/zone=us-central2", + }, + }, + " group `Labels overwrites an existing '--labels' parameter in rayStartParams": { + initialRayStartParams: map[string]string{ + "labels": "old=label,to-be=replaced", + "resources": "some-resources", // should be retained + }, + groupLabels: map[string]string{ + "new": "label", + }, + expectedRayStartParams: map[string]string{ + "labels": "new=label", + "resources": "some-resources", + }, + }, + "No-op when group `Labels` is nil": { + initialRayStartParams: map[string]string{"labels": "some=labels"}, + groupLabels: nil, + expectedRayStartParams: map[string]string{"labels": "some=labels"}, + }, + "No-op when group `Labels` is empty": { + initialRayStartParams: map[string]string{"labels": "some=labels"}, + groupLabels: map[string]string{}, + expectedRayStartParams: map[string]string{"labels": "some=labels"}, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + // copy of rayStartParams for test + rayStartParams := make(map[string]string) + for k, v := range tc.initialRayStartParams { + rayStartParams[k] = v + } + + reconcileRayStartParamsLabels(rayStartParams, tc.groupLabels) + + assert.Equal(t, tc.expectedRayStartParams, rayStartParams) + }) + } +} + +func TestReconcileRayStartParamsResources(t *testing.T) { + tests := map[string]struct { + initialRayStartParams map[string]string + groupResources corev1.ResourceList + expectedRayStartParams map[string]string + expectedK8sResources corev1.ResourceList + }{ + "No-op when group `Resources` is nil or empty": { + initialRayStartParams: map[string]string{"existing": "true"}, + groupResources: nil, + expectedRayStartParams: map[string]string{"existing": "true"}, + }, + "Basic CPU and Memory set in `Resources` override rayStartParams": { + initialRayStartParams: map[string]string{}, + groupResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + }, + expectedRayStartParams: map[string]string{ + "num-cpus": "2", + "memory": "4294967296", // 4Gi in bytes + }, + }, + "GPU and custom TPU resource set in `Resources` override rayStartParams": { + initialRayStartParams: map[string]string{}, + groupResources: corev1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + "TPU": resource.MustParse("4"), + }, + expectedRayStartParams: map[string]string{ + "num-gpus": "1", + "resources": "'{\"TPU\":4}'", + }, + }, + "Top-level `Resources` should override existing resource params": { + initialRayStartParams: map[string]string{ + "num-cpus": "1", + "memory": "1000", + "resources": "'{\"Custom-Resource\": 10}'", + }, + groupResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + "Custom-Resource": resource.MustParse("5"), + }, + expectedRayStartParams: map[string]string{ + "num-cpus": "4", + "memory": "1000", // preserved + "resources": "'{\"Custom-Resource\":5}'", + }, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + rayStartParams := make(map[string]string) + for k, v := range tc.initialRayStartParams { + rayStartParams[k] = v + } + + reconcileRayStartParamsResources(rayStartParams, tc.groupResources) + + // Verify rayStartParams are updated based on the top-level Resources values. + assert.Equal(t, tc.expectedRayStartParams, rayStartParams) + }) + } +} diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index cf6b9066323..79aa4dcca99 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -428,13 +428,28 @@ func CalculateAvailableReplicas(pods corev1.PodList) int32 { func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList { desiredResourcesList := []corev1.ResourceList{{}} - headPodResource := CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) + var headPodResource corev1.ResourceList + // Prioritize the top-level head group `Resources` field if specified. + if len(cluster.Spec.HeadGroupSpec.Resources) > 0 { + headPodResource = cluster.Spec.HeadGroupSpec.Resources + } else { + // Otherwise, calculate resources from the pod template spec. + headPodResource = CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) + } desiredResourcesList = append(desiredResourcesList, headPodResource) + for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { if nodeGroup.Suspend != nil && *nodeGroup.Suspend { continue } - podResource := CalculatePodResource(nodeGroup.Template.Spec) + // Prioritize the top-level worker group `Resources` field if specified. + var podResource corev1.ResourceList + if len(nodeGroup.Resources) > 0 { + podResource = nodeGroup.Resources.DeepCopy() + } else { + // Otherwise, calculate resources from the pod template spec. + podResource = CalculatePodResource(nodeGroup.Template.Spec) + } calculateReplicaResource(&podResource, nodeGroup.NumOfHosts) for i := int32(0); i < *nodeGroup.Replicas; i++ { desiredResourcesList = append(desiredResourcesList, podResource) @@ -445,10 +460,25 @@ func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList { func CalculateMinResources(cluster *rayv1.RayCluster) corev1.ResourceList { minResourcesList := []corev1.ResourceList{{}} - headPodResource := CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) + var headPodResource corev1.ResourceList + // Prioritize the top-level head group `Resources` field if specified. + if len(cluster.Spec.HeadGroupSpec.Resources) > 0 { + headPodResource = cluster.Spec.HeadGroupSpec.Resources + } else { + // Otherwise, calculate resources from the pod template spec. + headPodResource = CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) + } minResourcesList = append(minResourcesList, headPodResource) + for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { - podResource := CalculatePodResource(nodeGroup.Template.Spec) + // Prioritize the top-level worker group `Resources` field if specified. + var podResource corev1.ResourceList + if len(nodeGroup.Resources) > 0 { + podResource = nodeGroup.Resources.DeepCopy() + } else { + // Otherwise, calculate resources from the pod template spec. + podResource = CalculatePodResource(nodeGroup.Template.Spec) + } calculateReplicaResource(&podResource, nodeGroup.NumOfHosts) for i := int32(0); i < *nodeGroup.MinReplicas; i++ { minResourcesList = append(minResourcesList, podResource) diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 851e37af3ea..5f37dd43357 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1051,13 +1051,15 @@ func createPodSpec(cpu, memory string) corev1.PodSpec { func createRayClusterTemplate( head struct { - cpu string - memory string + resources corev1.ResourceList + cpu string + memory string }, workers []struct { replicas *int32 minReplicas *int32 suspend *bool + resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1066,6 +1068,7 @@ func createRayClusterTemplate( cluster := &rayv1.RayCluster{ Spec: rayv1.RayClusterSpec{ HeadGroupSpec: rayv1.HeadGroupSpec{ + Resources: head.resources, Template: corev1.PodTemplateSpec{ Spec: createPodSpec(head.cpu, head.memory), }, @@ -1079,6 +1082,7 @@ func createRayClusterTemplate( Replicas: w.replicas, MinReplicas: w.minReplicas, Suspend: w.suspend, + Resources: w.resources, Template: corev1.PodTemplateSpec{ Spec: createPodSpec(w.cpu, w.memory), }, @@ -1090,8 +1094,9 @@ func createRayClusterTemplate( func TestCalculateResources(t *testing.T) { headStruct := struct { - cpu string - memory string + resources corev1.ResourceList + cpu string + memory string }{ cpu: "1", memory: "100Mi", @@ -1128,6 +1133,7 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool + resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1161,6 +1167,7 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool + resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1202,6 +1209,7 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool + resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1229,6 +1237,96 @@ func TestCalculateResources(t *testing.T) { }, }, }, + { + name: "Top-level Head Resources should take precedence", + cluster: createRayClusterTemplate( + struct { + resources corev1.ResourceList + cpu string + memory string + }{ + cpu: "1", + memory: "100Mi", + resources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("5"), + corev1.ResourceMemory: resource.MustParse("500Mi"), + }, + }, + []struct { + replicas *int32 + minReplicas *int32 + suspend *bool + resources corev1.ResourceList + cpu string + memory string + numOfHosts int32 + }{ + { + numOfHosts: 1, + replicas: ptr.To[int32](2), + minReplicas: ptr.To[int32](1), + cpu: "1", + memory: "1Gi", + }, + }, + ), + expected: struct { + desiredResources corev1.ResourceList + minResources corev1.ResourceList + }{ + desiredResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("7"), + corev1.ResourceMemory: resource.MustParse("2548Mi"), + }, + minResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("6"), + corev1.ResourceMemory: resource.MustParse("1524Mi"), + }, + }, + }, + { + name: "Top-level Worker Resources should take precedence", + cluster: createRayClusterTemplate( + headStruct, + []struct { + replicas *int32 + minReplicas *int32 + suspend *bool + resources corev1.ResourceList + cpu string + memory string + numOfHosts int32 + }{ + { + numOfHosts: 1, + replicas: ptr.To[int32](3), + minReplicas: ptr.To[int32](2), + cpu: "1", + memory: "1Gi", + resources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + "TPU": resource.MustParse("4"), + }, + }, + }, + ), + expected: struct { + desiredResources corev1.ResourceList + minResources corev1.ResourceList + }{ + desiredResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("13"), + corev1.ResourceMemory: resource.MustParse("12388Mi"), + "TPU": resource.MustParse("12"), + }, + minResources: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("9"), + corev1.ResourceMemory: resource.MustParse("8292Mi"), + "TPU": resource.MustParse("8"), + }, + }, + }, } for _, tt := range tests { diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go index 6970ff2634a..65ef806e577 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go @@ -13,6 +13,8 @@ type HeadGroupSpecApplyConfiguration struct { Template *corev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` HeadService *apicorev1.Service `json:"headService,omitempty"` EnableIngress *bool `json:"enableIngress,omitempty"` + Resources *apicorev1.ResourceList `json:"resources,omitempty"` + Labels map[string]string `json:"labels,omitempty"` RayStartParams map[string]string `json:"rayStartParams,omitempty"` ServiceType *apicorev1.ServiceType `json:"serviceType,omitempty"` } @@ -47,6 +49,28 @@ func (b *HeadGroupSpecApplyConfiguration) WithEnableIngress(value bool) *HeadGro return b } +// WithResources sets the Resources field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Resources field is set to the value of the last call. +func (b *HeadGroupSpecApplyConfiguration) WithResources(value apicorev1.ResourceList) *HeadGroupSpecApplyConfiguration { + b.Resources = &value + return b +} + +// WithLabels puts the entries into the Labels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Labels field, +// overwriting an existing map entries in Labels field with the same key. +func (b *HeadGroupSpecApplyConfiguration) WithLabels(entries map[string]string) *HeadGroupSpecApplyConfiguration { + if b.Labels == nil && len(entries) > 0 { + b.Labels = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.Labels[k] = v + } + return b +} + // WithRayStartParams puts the entries into the RayStartParams field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, the entries provided by each call will be put on the RayStartParams field, diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go index 85199596806..e75378c518d 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go @@ -3,22 +3,25 @@ package v1 import ( - corev1 "k8s.io/client-go/applyconfigurations/core/v1" + corev1 "k8s.io/api/core/v1" + applyconfigurationscorev1 "k8s.io/client-go/applyconfigurations/core/v1" ) // WorkerGroupSpecApplyConfiguration represents a declarative configuration of the WorkerGroupSpec type for use // with apply. type WorkerGroupSpecApplyConfiguration struct { - Suspend *bool `json:"suspend,omitempty"` - GroupName *string `json:"groupName,omitempty"` - Replicas *int32 `json:"replicas,omitempty"` - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` - RayStartParams map[string]string `json:"rayStartParams,omitempty"` - Template *corev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` - ScaleStrategy *ScaleStrategyApplyConfiguration `json:"scaleStrategy,omitempty"` - NumOfHosts *int32 `json:"numOfHosts,omitempty"` + Suspend *bool `json:"suspend,omitempty"` + GroupName *string `json:"groupName,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas *int32 `json:"maxReplicas,omitempty"` + IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` + Resources *corev1.ResourceList `json:"resources,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + RayStartParams map[string]string `json:"rayStartParams,omitempty"` + Template *applyconfigurationscorev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` + ScaleStrategy *ScaleStrategyApplyConfiguration `json:"scaleStrategy,omitempty"` + NumOfHosts *int32 `json:"numOfHosts,omitempty"` } // WorkerGroupSpecApplyConfiguration constructs a declarative configuration of the WorkerGroupSpec type for use with @@ -75,6 +78,28 @@ func (b *WorkerGroupSpecApplyConfiguration) WithIdleTimeoutSeconds(value int32) return b } +// WithResources sets the Resources field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Resources field is set to the value of the last call. +func (b *WorkerGroupSpecApplyConfiguration) WithResources(value corev1.ResourceList) *WorkerGroupSpecApplyConfiguration { + b.Resources = &value + return b +} + +// WithLabels puts the entries into the Labels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Labels field, +// overwriting an existing map entries in Labels field with the same key. +func (b *WorkerGroupSpecApplyConfiguration) WithLabels(entries map[string]string) *WorkerGroupSpecApplyConfiguration { + if b.Labels == nil && len(entries) > 0 { + b.Labels = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.Labels[k] = v + } + return b +} + // WithRayStartParams puts the entries into the RayStartParams field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, the entries provided by each call will be put on the RayStartParams field, @@ -92,7 +117,7 @@ func (b *WorkerGroupSpecApplyConfiguration) WithRayStartParams(entries map[strin // WithTemplate sets the Template field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Template field is set to the value of the last call. -func (b *WorkerGroupSpecApplyConfiguration) WithTemplate(value *corev1.PodTemplateSpecApplyConfiguration) *WorkerGroupSpecApplyConfiguration { +func (b *WorkerGroupSpecApplyConfiguration) WithTemplate(value *applyconfigurationscorev1.PodTemplateSpecApplyConfiguration) *WorkerGroupSpecApplyConfiguration { b.Template = value return b } From d75cfee3f16d81cc0771887c95c5460ca728b54f Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 2 Oct 2025 05:28:56 +0000 Subject: [PATCH 2/6] Update API comment Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 4 ++-- ray-operator/apis/ray/v1/raycluster_types.go | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index bf3dbef74a2..9f5da834a4d 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -138,7 +138,7 @@ _Appears in:_ | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is the exact pod template used in K8s deployments, statefulsets, etc. | | | | `headService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | HeadService is the Kubernetes service of the head pod. | | | | `enableIngress` _boolean_ | EnableIngress indicates whether operator should create ingress object for head service or not. | | | -| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for the head group.
These labels will also be added to the Pods of this head group. | | | +| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for the head group.
These labels will also be added to the Pods of this head group and override the `--labels`
argument passed to `rayStartParams`. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: node-manager-port, object-store-memory, ... | | | | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | @@ -418,7 +418,7 @@ _Appears in:_ | `minReplicas` _integer_ | MinReplicas denotes the minimum number of desired Pods for this worker group. | 0 | | | `maxReplicas` _integer_ | MaxReplicas denotes the maximum number of desired Pods for this worker group, and the default value is maxInt32. | 2147483647 | | | `idleTimeoutSeconds` _integer_ | IdleTimeoutSeconds denotes the number of seconds to wait before the v2 autoscaler terminates an idle worker pod of this type.
This value is only used with the Ray Autoscaler enabled and defaults to the value set by the AutoscalingConfig if not specified for this worker group. | | | -| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for this worker group.
These labels will also be added to the Pods of this worker group. | | | +| `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for this worker group.
These labels will also be added to the Pods of this worker group and override the `--labels`
argument passed to `rayStartParams`. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: address, object-store-memory, ... | | | | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is a pod template for the worker | | | | `scaleStrategy` _[ScaleStrategy](#scalestrategy)_ | ScaleStrategy defines which pods to remove | | | diff --git a/ray-operator/apis/ray/v1/raycluster_types.go b/ray-operator/apis/ray/v1/raycluster_types.go index f7e944718ac..4d90b74ee05 100644 --- a/ray-operator/apis/ray/v1/raycluster_types.go +++ b/ray-operator/apis/ray/v1/raycluster_types.go @@ -76,10 +76,12 @@ type HeadGroupSpec struct { // +optional EnableIngress *bool `json:"enableIngress,omitempty"` // Resources specifies the resource quantities for the head group. + // These values override the resources passed to `rayStartParams` for the group. // +optional Resources corev1.ResourceList `json:"resources,omitempty"` // Labels specifies the Ray node labels for the head group. - // These labels will also be added to the Pods of this head group. + // These labels will also be added to the Pods of this head group and override the `--labels` + // argument passed to `rayStartParams`. // +optional Labels map[string]string `json:"labels,omitempty"` // RayStartParams are the params of the start command: node-manager-port, object-store-memory, ... @@ -114,10 +116,12 @@ type WorkerGroupSpec struct { // +optional IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` // Resources specifies the resource quantities for this worker group. + // These values override the resources passed to `rayStartParams` for the group. // +optional Resources corev1.ResourceList `json:"resources,omitempty"` // Labels specifies the Ray node labels for this worker group. - // These labels will also be added to the Pods of this worker group. + // These labels will also be added to the Pods of this worker group and override the `--labels` + // argument passed to `rayStartParams`. // +optional Labels map[string]string `json:"labels,omitempty"` // RayStartParams are the params of the start command: address, object-store-memory, ... From ee7b3badbe52456c40e1bbba67f5e3c7ff6429ee Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 16:07:37 +0000 Subject: [PATCH 3/6] Fix comments Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 2 + .../crds/ray.io_rayclusters.yaml | 12 +- .../kuberay-operator/crds/ray.io_rayjobs.yaml | 12 +- .../crds/ray.io_rayservices.yaml | 12 +- ray-operator/apis/ray/v1/raycluster_types.go | 10 +- .../apis/ray/v1/zz_generated.deepcopy.go | 8 +- .../config/crd/bases/ray.io_rayclusters.yaml | 12 +- .../config/crd/bases/ray.io_rayjobs.yaml | 12 +- .../config/crd/bases/ray.io_rayservices.yaml | 12 +- ray-operator/controllers/ray/common/pod.go | 103 +++++++++-------- .../controllers/ray/common/pod_test.go | 30 ++--- ray-operator/controllers/ray/utils/util.go | 38 +------ .../controllers/ray/utils/util_test.go | 106 +----------------- .../ray/v1/headgroupspec.go | 18 ++- .../ray/v1/workergroupspec.go | 45 ++++---- 15 files changed, 143 insertions(+), 289 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 9f5da834a4d..f692742d247 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -138,6 +138,7 @@ _Appears in:_ | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is the exact pod template used in K8s deployments, statefulsets, etc. | | | | `headService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | HeadService is the Kubernetes service of the head pod. | | | | `enableIngress` _boolean_ | EnableIngress indicates whether operator should create ingress object for head service or not. | | | +| `resources` _object (keys:string, values:string)_ | Resources specifies the resource quantities for the head group.
These values override the resources passed to `rayStartParams` for the group, but
have no effect on the resources set at the K8s Pod container level. | | | | `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for the head group.
These labels will also be added to the Pods of this head group and override the `--labels`
argument passed to `rayStartParams`. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: node-manager-port, object-store-memory, ... | | | | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | @@ -418,6 +419,7 @@ _Appears in:_ | `minReplicas` _integer_ | MinReplicas denotes the minimum number of desired Pods for this worker group. | 0 | | | `maxReplicas` _integer_ | MaxReplicas denotes the maximum number of desired Pods for this worker group, and the default value is maxInt32. | 2147483647 | | | `idleTimeoutSeconds` _integer_ | IdleTimeoutSeconds denotes the number of seconds to wait before the v2 autoscaler terminates an idle worker pod of this type.
This value is only used with the Ray Autoscaler enabled and defaults to the value set by the AutoscalingConfig if not specified for this worker group. | | | +| `resources` _object (keys:string, values:string)_ | Resources specifies the resource quantities for this worker group.
These values override the resources passed to `rayStartParams` for the group, but
have no effect on the resources set at the K8s Pod container level. | | | | `labels` _object (keys:string, values:string)_ | Labels specifies the Ray node labels for this worker group.
These labels will also be added to the Pods of this worker group and override the `--labels`
argument passed to `rayStartParams`. | | | | `rayStartParams` _object (keys:string, values:string)_ | RayStartParams are the params of the start command: address, object-store-memory, ... | | | | `template` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | Template is a pod template for the worker | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml index bb6fe99d1a6..29a303bbb35 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml @@ -641,11 +641,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4370,11 +4366,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index 4d431edf230..1cc94efd59d 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -691,11 +691,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4420,11 +4416,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 8c74a4edfaf..e2d61172a3c 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -621,11 +621,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4350,11 +4346,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/ray-operator/apis/ray/v1/raycluster_types.go b/ray-operator/apis/ray/v1/raycluster_types.go index 4d90b74ee05..6a6d40b8278 100644 --- a/ray-operator/apis/ray/v1/raycluster_types.go +++ b/ray-operator/apis/ray/v1/raycluster_types.go @@ -76,9 +76,10 @@ type HeadGroupSpec struct { // +optional EnableIngress *bool `json:"enableIngress,omitempty"` // Resources specifies the resource quantities for the head group. - // These values override the resources passed to `rayStartParams` for the group. + // These values override the resources passed to `rayStartParams` for the group, but + // have no effect on the resources set at the K8s Pod container level. // +optional - Resources corev1.ResourceList `json:"resources,omitempty"` + Resources map[string]string `json:"resources,omitempty"` // Labels specifies the Ray node labels for the head group. // These labels will also be added to the Pods of this head group and override the `--labels` // argument passed to `rayStartParams`. @@ -116,9 +117,10 @@ type WorkerGroupSpec struct { // +optional IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` // Resources specifies the resource quantities for this worker group. - // These values override the resources passed to `rayStartParams` for the group. + // These values override the resources passed to `rayStartParams` for the group, but + // have no effect on the resources set at the K8s Pod container level. // +optional - Resources corev1.ResourceList `json:"resources,omitempty"` + Resources map[string]string `json:"resources,omitempty"` // Labels specifies the Ray node labels for this worker group. // These labels will also be added to the Pods of this worker group and override the `--labels` // argument passed to `rayStartParams`. diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 7a60f940887..b5f981b91a4 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -181,9 +181,9 @@ func (in *HeadGroupSpec) DeepCopyInto(out *HeadGroupSpec) { } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(corev1.ResourceList, len(*in)) + *out = make(map[string]string, len(*in)) for key, val := range *in { - (*out)[key] = val.DeepCopy() + (*out)[key] = val } } if in.Labels != nil { @@ -843,9 +843,9 @@ func (in *WorkerGroupSpec) DeepCopyInto(out *WorkerGroupSpec) { } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = make(corev1.ResourceList, len(*in)) + *out = make(map[string]string, len(*in)) for key, val := range *in { - (*out)[key] = val.DeepCopy() + (*out)[key] = val } } if in.Labels != nil { diff --git a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml index bb6fe99d1a6..29a303bbb35 100644 --- a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml @@ -641,11 +641,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4370,11 +4366,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index 4d431edf230..1cc94efd59d 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -691,11 +691,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4420,11 +4416,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 8c74a4edfaf..e2d61172a3c 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -621,11 +621,7 @@ spec: type: object resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object serviceType: type: string @@ -4350,11 +4346,7 @@ spec: type: integer resources: additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + type: string type: object scaleStrategy: properties: diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 8d4cf5ea9e2..94ba0ebdd1d 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -172,11 +172,11 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head // This ensures privilege of KubeRay users are contained within the namespace of the RayCluster. podTemplate.ObjectMeta.Namespace = instance.Namespace - // Reconcile top-level Resources for head group with rayStartParams. - reconcileRayStartParamsResources(headSpec.RayStartParams, headSpec.Resources) + // Update rayStartParams with top-level Resources for head group. + updateRayStartParamsResources(ctx, headSpec.RayStartParams, headSpec.Resources) - // Reconcile top-level Labels for head group with `--labels` in rayStartParams. - reconcileRayStartParamsLabels(headSpec.RayStartParams, headSpec.Labels) + // Update --labels` in rayStartParams with top-level Labels for head group. + updateRayStartParamsLabels(headSpec.RayStartParams, headSpec.Labels) // Merge K8s labels from the Pod template and the top-level `Labels` field. mergedLabels := mergeLabels(headSpec.Template.ObjectMeta.Labels, headSpec.Labels) @@ -318,11 +318,11 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo // Hence, we set `ObjectMeta.Name` to an empty string, and use GenerateName to prevent name conflicts. podTemplate.ObjectMeta.Name = "" - // Reconcile top-level Resources for worker group with rayStartParams. - reconcileRayStartParamsResources(workerSpec.RayStartParams, workerSpec.Resources) + // Update rayStartParams with top-level Resources for worker group. + updateRayStartParamsResources(ctx, workerSpec.RayStartParams, workerSpec.Resources) - // Reconcile top-level Labels for worker group with `--labels` in rayStartParams. - reconcileRayStartParamsLabels(workerSpec.RayStartParams, workerSpec.Labels) + // Update --labels` in rayStartParams with top-level Labels for worker group. + updateRayStartParamsLabels(workerSpec.RayStartParams, workerSpec.Labels) // Merge K8s labels from the Pod template and the top-level `Labels` field. mergedLabels := mergeLabels(workerSpec.Template.ObjectMeta.Labels, workerSpec.Labels) @@ -1040,58 +1040,71 @@ func findMemoryReqOrLimit(container corev1.Container) (res *resource.Quantity) { return nil } -// reconcileRayStartParamsLabels reconciles `--labels` in rayStartParams based on group `Labels`. -func reconcileRayStartParamsLabels(rayStartParams map[string]string, groupLabels map[string]string) { - if len(groupLabels) > 0 { - var labels []string - // Sort label keys for deterministic output. - keys := make([]string, 0, len(groupLabels)) - for k := range groupLabels { - keys = append(keys, k) - } - sort.Strings(keys) +// updateRayStartParamsLabels reconciles `--labels` in rayStartParams based on group `Labels`. +func updateRayStartParamsLabels(rayStartParams map[string]string, groupLabels map[string]string) { + if len(groupLabels) == 0 { + return + } + var labels []string + // Sort label keys for deterministic output. + keys := make([]string, 0, len(groupLabels)) + for k := range groupLabels { + keys = append(keys, k) + } + sort.Strings(keys) - for _, k := range keys { - labels = append(labels, fmt.Sprintf("%s=%s", k, groupLabels[k])) - } - // When provided this will override any `--labels` value specified in rayStartParams. - rayStartParams["labels"] = strings.Join(labels, ",") + for _, k := range keys { + labels = append(labels, fmt.Sprintf("%s=%s", k, groupLabels[k])) } + // When provided this will override any `--labels` value specified in rayStartParams. + rayStartParams["labels"] = strings.Join(labels, ",") } -// reconcileRayStartParamsResources reconciles rayStartParams based on the top-level `Resources` field. -func reconcileRayStartParamsResources(rayStartParams map[string]string, topLevelResources corev1.ResourceList) { - if len(topLevelResources) > 0 { - // Override relevant rayStartParams fields to ensure consistency. - rayResourcesJson := make(map[string]float64) - for name, quantity := range topLevelResources { - strName := string(name) - if name == corev1.ResourceCPU { - rayStartParams["num-cpus"] = strconv.FormatInt(quantity.Value(), 10) - } else if name == corev1.ResourceMemory { - rayStartParams["memory"] = strconv.FormatInt(quantity.Value(), 10) - } else if utils.IsGPUResourceKey(strName) { - rayStartParams["num-gpus"] = strconv.FormatInt(quantity.Value(), 10) - } else { - rayResourcesJson[strName] = quantity.AsApproximateFloat64() - } +// updateRayStartParamsResources reconciles rayStartParams based on the top-level `Resources` field. +func updateRayStartParamsResources(ctx context.Context, rayStartParams map[string]string, groupResources map[string]string) { + log := ctrl.LoggerFrom(ctx) + + if len(groupResources) == 0 { + return + } + // Override relevant rayStartParams fields to ensure consistency. + rayResourcesJson := make(map[string]float64) + for name, quantity := range groupResources { + q, err := resource.ParseQuantity(quantity) + if err != nil { + log.Info("Skipping resource %s: failed to parse quantity '%s': %v", name, quantity, err) + continue } - if len(rayResourcesJson) > 0 { - jsonBytes, _ := json.Marshal(rayResourcesJson) - rayStartParams["resources"] = fmt.Sprintf("'%s'", string(jsonBytes)) + if name == string(corev1.ResourceCPU) { + rayStartParams["num-cpus"] = strconv.FormatInt(q.Value(), 10) + } else if name == string(corev1.ResourceMemory) { + rayStartParams["memory"] = strconv.FormatInt(q.Value(), 10) + } else if utils.IsGPUResourceKey(name) { + rayStartParams["num-gpus"] = strconv.FormatInt(q.Value(), 10) + } else { + rayResourcesJson[name] = q.AsApproximateFloat64() + } + } + + if len(rayResourcesJson) > 0 { + jsonBytes, err := json.Marshal(rayResourcesJson) + if err != nil { + log.Error(err, "Failed to marshal Ray Resources JSON for rayStartParams.") + return } + rayStartParams["resources"] = fmt.Sprintf("'%s'", string(jsonBytes)) } } -// mergeLabels combines labels from a pod template and a top-level `labels` spec, +// mergeLabels combines labels from a pod template and a group `labels` spec, // with the top-level labels field taking precedence. -func mergeLabels(templateLabels map[string]string, topLevelLabels map[string]string) map[string]string { +func mergeLabels(templateLabels map[string]string, groupLabels map[string]string) map[string]string { merged := make(map[string]string) for k, v := range templateLabels { merged[k] = v } - for k, v := range topLevelLabels { + for k, v := range groupLabels { merged[k] = v } return merged diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 12c79c363e4..8b26b4bee5f 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1930,7 +1930,7 @@ func TestMergeLabels(t *testing.T) { } } -func TestReconcileRayStartParamsLabels(t *testing.T) { +func TestUpdateRayStartParamsLabels(t *testing.T) { tests := map[string]struct { initialRayStartParams map[string]string groupLabels map[string]string @@ -1981,17 +1981,19 @@ func TestReconcileRayStartParamsLabels(t *testing.T) { rayStartParams[k] = v } - reconcileRayStartParamsLabels(rayStartParams, tc.groupLabels) + updateRayStartParamsLabels(rayStartParams, tc.groupLabels) assert.Equal(t, tc.expectedRayStartParams, rayStartParams) }) } } -func TestReconcileRayStartParamsResources(t *testing.T) { +func TestUpdateRayStartParamsResources(t *testing.T) { + ctx := context.Background() + tests := map[string]struct { initialRayStartParams map[string]string - groupResources corev1.ResourceList + groupResources map[string]string expectedRayStartParams map[string]string expectedK8sResources corev1.ResourceList }{ @@ -2002,9 +2004,9 @@ func TestReconcileRayStartParamsResources(t *testing.T) { }, "Basic CPU and Memory set in `Resources` override rayStartParams": { initialRayStartParams: map[string]string{}, - groupResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("4Gi"), + groupResources: map[string]string{ + string(corev1.ResourceCPU): "2", + string(corev1.ResourceMemory): "4Gi", }, expectedRayStartParams: map[string]string{ "num-cpus": "2", @@ -2013,9 +2015,9 @@ func TestReconcileRayStartParamsResources(t *testing.T) { }, "GPU and custom TPU resource set in `Resources` override rayStartParams": { initialRayStartParams: map[string]string{}, - groupResources: corev1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - "TPU": resource.MustParse("4"), + groupResources: map[string]string{ + "nvidia.com/gpu": "1", + "TPU": "4", }, expectedRayStartParams: map[string]string{ "num-gpus": "1", @@ -2028,9 +2030,9 @@ func TestReconcileRayStartParamsResources(t *testing.T) { "memory": "1000", "resources": "'{\"Custom-Resource\": 10}'", }, - groupResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("4"), - "Custom-Resource": resource.MustParse("5"), + groupResources: map[string]string{ + string(corev1.ResourceCPU): "4", + "Custom-Resource": "5", }, expectedRayStartParams: map[string]string{ "num-cpus": "4", @@ -2047,7 +2049,7 @@ func TestReconcileRayStartParamsResources(t *testing.T) { rayStartParams[k] = v } - reconcileRayStartParamsResources(rayStartParams, tc.groupResources) + updateRayStartParamsResources(ctx, rayStartParams, tc.groupResources) // Verify rayStartParams are updated based on the top-level Resources values. assert.Equal(t, tc.expectedRayStartParams, rayStartParams) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 79aa4dcca99..cf6b9066323 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -428,28 +428,13 @@ func CalculateAvailableReplicas(pods corev1.PodList) int32 { func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList { desiredResourcesList := []corev1.ResourceList{{}} - var headPodResource corev1.ResourceList - // Prioritize the top-level head group `Resources` field if specified. - if len(cluster.Spec.HeadGroupSpec.Resources) > 0 { - headPodResource = cluster.Spec.HeadGroupSpec.Resources - } else { - // Otherwise, calculate resources from the pod template spec. - headPodResource = CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) - } + headPodResource := CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) desiredResourcesList = append(desiredResourcesList, headPodResource) - for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { if nodeGroup.Suspend != nil && *nodeGroup.Suspend { continue } - // Prioritize the top-level worker group `Resources` field if specified. - var podResource corev1.ResourceList - if len(nodeGroup.Resources) > 0 { - podResource = nodeGroup.Resources.DeepCopy() - } else { - // Otherwise, calculate resources from the pod template spec. - podResource = CalculatePodResource(nodeGroup.Template.Spec) - } + podResource := CalculatePodResource(nodeGroup.Template.Spec) calculateReplicaResource(&podResource, nodeGroup.NumOfHosts) for i := int32(0); i < *nodeGroup.Replicas; i++ { desiredResourcesList = append(desiredResourcesList, podResource) @@ -460,25 +445,10 @@ func CalculateDesiredResources(cluster *rayv1.RayCluster) corev1.ResourceList { func CalculateMinResources(cluster *rayv1.RayCluster) corev1.ResourceList { minResourcesList := []corev1.ResourceList{{}} - var headPodResource corev1.ResourceList - // Prioritize the top-level head group `Resources` field if specified. - if len(cluster.Spec.HeadGroupSpec.Resources) > 0 { - headPodResource = cluster.Spec.HeadGroupSpec.Resources - } else { - // Otherwise, calculate resources from the pod template spec. - headPodResource = CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) - } + headPodResource := CalculatePodResource(cluster.Spec.HeadGroupSpec.Template.Spec) minResourcesList = append(minResourcesList, headPodResource) - for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { - // Prioritize the top-level worker group `Resources` field if specified. - var podResource corev1.ResourceList - if len(nodeGroup.Resources) > 0 { - podResource = nodeGroup.Resources.DeepCopy() - } else { - // Otherwise, calculate resources from the pod template spec. - podResource = CalculatePodResource(nodeGroup.Template.Spec) - } + podResource := CalculatePodResource(nodeGroup.Template.Spec) calculateReplicaResource(&podResource, nodeGroup.NumOfHosts) for i := int32(0); i < *nodeGroup.MinReplicas; i++ { minResourcesList = append(minResourcesList, podResource) diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 5f37dd43357..851e37af3ea 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1051,15 +1051,13 @@ func createPodSpec(cpu, memory string) corev1.PodSpec { func createRayClusterTemplate( head struct { - resources corev1.ResourceList - cpu string - memory string + cpu string + memory string }, workers []struct { replicas *int32 minReplicas *int32 suspend *bool - resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1068,7 +1066,6 @@ func createRayClusterTemplate( cluster := &rayv1.RayCluster{ Spec: rayv1.RayClusterSpec{ HeadGroupSpec: rayv1.HeadGroupSpec{ - Resources: head.resources, Template: corev1.PodTemplateSpec{ Spec: createPodSpec(head.cpu, head.memory), }, @@ -1082,7 +1079,6 @@ func createRayClusterTemplate( Replicas: w.replicas, MinReplicas: w.minReplicas, Suspend: w.suspend, - Resources: w.resources, Template: corev1.PodTemplateSpec{ Spec: createPodSpec(w.cpu, w.memory), }, @@ -1094,9 +1090,8 @@ func createRayClusterTemplate( func TestCalculateResources(t *testing.T) { headStruct := struct { - resources corev1.ResourceList - cpu string - memory string + cpu string + memory string }{ cpu: "1", memory: "100Mi", @@ -1133,7 +1128,6 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool - resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1167,7 +1161,6 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool - resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1209,7 +1202,6 @@ func TestCalculateResources(t *testing.T) { replicas *int32 minReplicas *int32 suspend *bool - resources corev1.ResourceList cpu string memory string numOfHosts int32 @@ -1237,96 +1229,6 @@ func TestCalculateResources(t *testing.T) { }, }, }, - { - name: "Top-level Head Resources should take precedence", - cluster: createRayClusterTemplate( - struct { - resources corev1.ResourceList - cpu string - memory string - }{ - cpu: "1", - memory: "100Mi", - resources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("5"), - corev1.ResourceMemory: resource.MustParse("500Mi"), - }, - }, - []struct { - replicas *int32 - minReplicas *int32 - suspend *bool - resources corev1.ResourceList - cpu string - memory string - numOfHosts int32 - }{ - { - numOfHosts: 1, - replicas: ptr.To[int32](2), - minReplicas: ptr.To[int32](1), - cpu: "1", - memory: "1Gi", - }, - }, - ), - expected: struct { - desiredResources corev1.ResourceList - minResources corev1.ResourceList - }{ - desiredResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("7"), - corev1.ResourceMemory: resource.MustParse("2548Mi"), - }, - minResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("6"), - corev1.ResourceMemory: resource.MustParse("1524Mi"), - }, - }, - }, - { - name: "Top-level Worker Resources should take precedence", - cluster: createRayClusterTemplate( - headStruct, - []struct { - replicas *int32 - minReplicas *int32 - suspend *bool - resources corev1.ResourceList - cpu string - memory string - numOfHosts int32 - }{ - { - numOfHosts: 1, - replicas: ptr.To[int32](3), - minReplicas: ptr.To[int32](2), - cpu: "1", - memory: "1Gi", - resources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("4"), - corev1.ResourceMemory: resource.MustParse("4Gi"), - "TPU": resource.MustParse("4"), - }, - }, - }, - ), - expected: struct { - desiredResources corev1.ResourceList - minResources corev1.ResourceList - }{ - desiredResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("13"), - corev1.ResourceMemory: resource.MustParse("12388Mi"), - "TPU": resource.MustParse("12"), - }, - minResources: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("9"), - corev1.ResourceMemory: resource.MustParse("8292Mi"), - "TPU": resource.MustParse("8"), - }, - }, - }, } for _, tt := range tests { diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go index 65ef806e577..c5233d65732 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/headgroupspec.go @@ -13,7 +13,7 @@ type HeadGroupSpecApplyConfiguration struct { Template *corev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` HeadService *apicorev1.Service `json:"headService,omitempty"` EnableIngress *bool `json:"enableIngress,omitempty"` - Resources *apicorev1.ResourceList `json:"resources,omitempty"` + Resources map[string]string `json:"resources,omitempty"` Labels map[string]string `json:"labels,omitempty"` RayStartParams map[string]string `json:"rayStartParams,omitempty"` ServiceType *apicorev1.ServiceType `json:"serviceType,omitempty"` @@ -49,11 +49,17 @@ func (b *HeadGroupSpecApplyConfiguration) WithEnableIngress(value bool) *HeadGro return b } -// WithResources sets the Resources field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Resources field is set to the value of the last call. -func (b *HeadGroupSpecApplyConfiguration) WithResources(value apicorev1.ResourceList) *HeadGroupSpecApplyConfiguration { - b.Resources = &value +// WithResources puts the entries into the Resources field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Resources field, +// overwriting an existing map entries in Resources field with the same key. +func (b *HeadGroupSpecApplyConfiguration) WithResources(entries map[string]string) *HeadGroupSpecApplyConfiguration { + if b.Resources == nil && len(entries) > 0 { + b.Resources = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.Resources[k] = v + } return b } diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go index e75378c518d..7aa692895f1 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/workergroupspec.go @@ -3,25 +3,24 @@ package v1 import ( - corev1 "k8s.io/api/core/v1" - applyconfigurationscorev1 "k8s.io/client-go/applyconfigurations/core/v1" + corev1 "k8s.io/client-go/applyconfigurations/core/v1" ) // WorkerGroupSpecApplyConfiguration represents a declarative configuration of the WorkerGroupSpec type for use // with apply. type WorkerGroupSpecApplyConfiguration struct { - Suspend *bool `json:"suspend,omitempty"` - GroupName *string `json:"groupName,omitempty"` - Replicas *int32 `json:"replicas,omitempty"` - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` - Resources *corev1.ResourceList `json:"resources,omitempty"` - Labels map[string]string `json:"labels,omitempty"` - RayStartParams map[string]string `json:"rayStartParams,omitempty"` - Template *applyconfigurationscorev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` - ScaleStrategy *ScaleStrategyApplyConfiguration `json:"scaleStrategy,omitempty"` - NumOfHosts *int32 `json:"numOfHosts,omitempty"` + Suspend *bool `json:"suspend,omitempty"` + GroupName *string `json:"groupName,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas *int32 `json:"maxReplicas,omitempty"` + IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"` + Resources map[string]string `json:"resources,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + RayStartParams map[string]string `json:"rayStartParams,omitempty"` + Template *corev1.PodTemplateSpecApplyConfiguration `json:"template,omitempty"` + ScaleStrategy *ScaleStrategyApplyConfiguration `json:"scaleStrategy,omitempty"` + NumOfHosts *int32 `json:"numOfHosts,omitempty"` } // WorkerGroupSpecApplyConfiguration constructs a declarative configuration of the WorkerGroupSpec type for use with @@ -78,11 +77,17 @@ func (b *WorkerGroupSpecApplyConfiguration) WithIdleTimeoutSeconds(value int32) return b } -// WithResources sets the Resources field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Resources field is set to the value of the last call. -func (b *WorkerGroupSpecApplyConfiguration) WithResources(value corev1.ResourceList) *WorkerGroupSpecApplyConfiguration { - b.Resources = &value +// WithResources puts the entries into the Resources field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Resources field, +// overwriting an existing map entries in Resources field with the same key. +func (b *WorkerGroupSpecApplyConfiguration) WithResources(entries map[string]string) *WorkerGroupSpecApplyConfiguration { + if b.Resources == nil && len(entries) > 0 { + b.Resources = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.Resources[k] = v + } return b } @@ -117,7 +122,7 @@ func (b *WorkerGroupSpecApplyConfiguration) WithRayStartParams(entries map[strin // WithTemplate sets the Template field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Template field is set to the value of the last call. -func (b *WorkerGroupSpecApplyConfiguration) WithTemplate(value *applyconfigurationscorev1.PodTemplateSpecApplyConfiguration) *WorkerGroupSpecApplyConfiguration { +func (b *WorkerGroupSpecApplyConfiguration) WithTemplate(value *corev1.PodTemplateSpecApplyConfiguration) *WorkerGroupSpecApplyConfiguration { b.Template = value return b } From f7f85dd9d536537b3c33e255c440defb9f3bda2e Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 6 Oct 2025 12:40:03 +0000 Subject: [PATCH 4/6] Add validation logic Signed-off-by: Ryan O'Leary --- .../controllers/ray/utils/validation.go | 36 ++++++ .../controllers/ray/utils/validation_test.go | 121 ++++++++++++++++++ 2 files changed, 157 insertions(+) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 74d2b4fe0e6..7d44aee3fa2 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -3,6 +3,7 @@ package utils import ( errstd "errors" "fmt" + "strings" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" @@ -33,16 +34,51 @@ func ValidateRayClusterMetadata(metadata metav1.ObjectMeta) error { return nil } +// validateRayGroupResourcesAndLabels checks for conflicting resource definitions and invalid labels. +func validateRayGroupResourcesAndLabels(groupName string, rayStartParams, resources, labels map[string]string) error { + hasRayStartResources := rayStartParams["num-cpus"] != "" || + rayStartParams["num-gpus"] != "" || + rayStartParams["memory"] != "" || + rayStartParams["resources"] != "" + if hasRayStartResources && len(resources) > 0 { + return fmt.Errorf("resource fields should not be set in both rayStartParams and Resources for %s group. Please use only one", groupName) + } + + if _, ok := rayStartParams["labels"]; ok { + return fmt.Errorf("rayStartParams['labels'] is not supported for %s group. Please use the top-level Labels field instead", groupName) + } + + // Validate syntax for the top-level Labels field is parsable. + for key, val := range labels { + if strings.Contains(key, ",") { + return fmt.Errorf("label key for %s group cannot contain commas, but found: '%s'", groupName, key) + } + if strings.Contains(val, ",") { + return fmt.Errorf("label value for key '%s' in %s group cannot contain commas, but found: '%s'", key, groupName, val) + } + } + + return nil +} + // Validation for invalid Ray Cluster configurations. func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]string) error { if len(spec.HeadGroupSpec.Template.Spec.Containers) == 0 { return fmt.Errorf("headGroupSpec should have at least one container") } + if err := validateRayGroupResourcesAndLabels("Head", spec.HeadGroupSpec.RayStartParams, spec.HeadGroupSpec.Resources, spec.HeadGroupSpec.Labels); err != nil { + return err + } + for _, workerGroup := range spec.WorkerGroupSpecs { if len(workerGroup.Template.Spec.Containers) == 0 { return fmt.Errorf("workerGroupSpec should have at least one container") } + + if err := validateRayGroupResourcesAndLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Resources, workerGroup.Labels); err != nil { + return err + } } if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil { diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index dc464424f40..7a1bc5ca806 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -683,6 +683,127 @@ func TestValidateRayClusterSpecAutoscaler(t *testing.T) { } } +func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { + // Util function to create a RayCluster spec. + createSpec := func() rayv1.RayClusterSpec { + return rayv1.RayClusterSpec{ + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group", + Template: podTemplateSpec(nil, nil), + }, + }, + } + } + + tests := []struct { + name string + errorMessage string + spec rayv1.RayClusterSpec + expectError bool + }{ + { + name: "Invalid: Head group has resources in both rayStartParams and top-level Resources", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.RayStartParams = map[string]string{"num-cpus": "1"} + s.HeadGroupSpec.Resources = map[string]string{"CPU": "1"} + return s + }(), + expectError: true, + errorMessage: "resource fields should not be set in both rayStartParams and Resources for Head group. Please use only one", + }, + { + name: "Invalid: Worker group has resources in both rayStartParams and .Resources", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.WorkerGroupSpecs[0].RayStartParams = map[string]string{"num-gpus": "1"} + s.WorkerGroupSpecs[0].Resources = map[string]string{"GPU": "1"} + return s + }(), + expectError: true, + errorMessage: "resource fields should not be set in both rayStartParams and Resources for worker-group group. Please use only one", + }, + { + name: "Valid: Only rayStartParams resources are set for head", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.RayStartParams = map[string]string{ + "num-cpus": "2", + "memory": "4G", + "resources": "{\"TPU\": \"8\"}", + } + return s + }(), + expectError: false, + }, + { + name: "Valid: Only .Resources field is set for worker", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.WorkerGroupSpecs[0].Resources = map[string]string{"CPU": "2", "memory": "4G", "TPU": "8"} + return s + }(), + expectError: false, + }, + { + name: "Invalid: Head group has 'labels' in rayStartParams", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.RayStartParams = map[string]string{"labels": "ray.io/node-group=worker-group-1"} + return s + }(), + expectError: true, + errorMessage: "rayStartParams['labels'] is not supported for Head group. Please use the top-level Labels field instead", + }, + { + name: "Invalid: Worker group has 'labels' in rayStartParams", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.WorkerGroupSpecs[0].RayStartParams = map[string]string{"labels": "ray.io/node-group=worker-group-1"} + return s + }(), + expectError: true, + errorMessage: "rayStartParams['labels'] is not supported for worker-group group. Please use the top-level Labels field instead", + }, + { + name: "Valid: Only .Labels field is set", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.Labels = map[string]string{"ray.io/market-type": "on-demand"} + s.WorkerGroupSpecs[0].Labels = map[string]string{"ray.io/accelerator-type": "TPU-V6E"} + return s + }(), + expectError: false, + }, + { + name: "Invalid: .Labels field values contain commas", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.Labels = map[string]string{"invalid,labels,with,commas": "value"} + return s + }(), + expectError: true, + errorMessage: "label key for Head group cannot contain commas, but found: 'invalid,labels,with,commas'", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := ValidateRayClusterSpec(&tt.spec, nil) + if tt.expectError { + require.Error(t, err) + assert.EqualError(t, err, tt.errorMessage) + } else { + require.NoError(t, err) + } + }) + } +} + func TestValidateRayJobStatus(t *testing.T) { tests := []struct { name string From 993939078c4c5ede7119db56741d7942e314db9b Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Wed, 15 Oct 2025 14:15:25 -0700 Subject: [PATCH 5/6] Update ray-operator/controllers/ray/common/pod.go Co-authored-by: Kai-Hsun Chen Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/common/pod.go | 1 - 1 file changed, 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 94ba0ebdd1d..1eb086d2eb2 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -1056,7 +1056,6 @@ func updateRayStartParamsLabels(rayStartParams map[string]string, groupLabels ma for _, k := range keys { labels = append(labels, fmt.Sprintf("%s=%s", k, groupLabels[k])) } - // When provided this will override any `--labels` value specified in rayStartParams. rayStartParams["labels"] = strings.Join(labels, ",") } From 998e2d9de4c992376381744a2c46b0442646fafb Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 16 Oct 2025 00:23:01 +0000 Subject: [PATCH 6/6] Add k8s syntax validation Signed-off-by: Ryan O'Leary --- .../controllers/ray/utils/validation.go | 44 +++++++++---- .../controllers/ray/utils/validation_test.go | 65 ++++++++++++++++--- 2 files changed, 89 insertions(+), 20 deletions(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 7d44aee3fa2..f804121591d 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -34,30 +34,46 @@ func ValidateRayClusterMetadata(metadata metav1.ObjectMeta) error { return nil } -// validateRayGroupResourcesAndLabels checks for conflicting resource definitions and invalid labels. -func validateRayGroupResourcesAndLabels(groupName string, rayStartParams, resources, labels map[string]string) error { +// validateRayGroupResources checks for conflicting resource definitions. +func validateRayGroupResources(groupName string, rayStartParams, resources map[string]string) error { hasRayStartResources := rayStartParams["num-cpus"] != "" || rayStartParams["num-gpus"] != "" || rayStartParams["memory"] != "" || rayStartParams["resources"] != "" if hasRayStartResources && len(resources) > 0 { - return fmt.Errorf("resource fields should not be set in both rayStartParams and Resources for %s group. Please use only one", groupName) + return fmt.Errorf("resource fields should not be set in both rayStartParams and Resources for %s group; please use only one", groupName) } + return nil +} +// validateRayGroupLabels checks for invalid label definitions and correct label syntax. +func validateRayGroupLabels(groupName string, rayStartParams, labels map[string]string) error { if _, ok := rayStartParams["labels"]; ok { - return fmt.Errorf("rayStartParams['labels'] is not supported for %s group. Please use the top-level Labels field instead", groupName) + return fmt.Errorf("rayStartParams['labels'] is not supported for %s group; please use the top-level Labels field instead", groupName) } - // Validate syntax for the top-level Labels field is parsable. + // Validate that labels conforms to Kubernetes label syntax. + var allErrs []string for key, val := range labels { - if strings.Contains(key, ",") { - return fmt.Errorf("label key for %s group cannot contain commas, but found: '%s'", groupName, key) + // Validate the label key. + if errs := validation.IsQualifiedName(key); len(errs) > 0 { + for _, err := range errs { + allErrs = append(allErrs, fmt.Sprintf("invalid label key for %s group: '%s', error: %s", groupName, key, err)) + } } - if strings.Contains(val, ",") { - return fmt.Errorf("label value for key '%s' in %s group cannot contain commas, but found: '%s'", key, groupName, val) + + // Validate the label value. + if errs := validation.IsValidLabelValue(val); len(errs) > 0 { + for _, err := range errs { + allErrs = append(allErrs, fmt.Sprintf("invalid label value for key '%s' in %s group: '%s', error: %s", key, groupName, val, err)) + } } } + if len(allErrs) > 0 { + return fmt.Errorf("%s", strings.Join(allErrs, "; ")) + } + return nil } @@ -67,7 +83,10 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s return fmt.Errorf("headGroupSpec should have at least one container") } - if err := validateRayGroupResourcesAndLabels("Head", spec.HeadGroupSpec.RayStartParams, spec.HeadGroupSpec.Resources, spec.HeadGroupSpec.Labels); err != nil { + if err := validateRayGroupResources("Head", spec.HeadGroupSpec.RayStartParams, spec.HeadGroupSpec.Resources); err != nil { + return err + } + if err := validateRayGroupLabels("Head", spec.HeadGroupSpec.RayStartParams, spec.HeadGroupSpec.Labels); err != nil { return err } @@ -76,7 +95,10 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s return fmt.Errorf("workerGroupSpec should have at least one container") } - if err := validateRayGroupResourcesAndLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Resources, workerGroup.Labels); err != nil { + if err := validateRayGroupResources(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Resources); err != nil { + return err + } + if err := validateRayGroupLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Labels); err != nil { return err } } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 7a1bc5ca806..ec9bfb30a7b 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -683,7 +683,7 @@ func TestValidateRayClusterSpecAutoscaler(t *testing.T) { } } -func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { +func TestValidateRayClusterSpec_Resources(t *testing.T) { // Util function to create a RayCluster spec. createSpec := func() rayv1.RayClusterSpec { return rayv1.RayClusterSpec{ @@ -714,7 +714,7 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { return s }(), expectError: true, - errorMessage: "resource fields should not be set in both rayStartParams and Resources for Head group. Please use only one", + errorMessage: "resource fields should not be set in both rayStartParams and Resources for Head group; please use only one", }, { name: "Invalid: Worker group has resources in both rayStartParams and .Resources", @@ -725,7 +725,7 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { return s }(), expectError: true, - errorMessage: "resource fields should not be set in both rayStartParams and Resources for worker-group group. Please use only one", + errorMessage: "resource fields should not be set in both rayStartParams and Resources for worker-group group; please use only one", }, { name: "Valid: Only rayStartParams resources are set for head", @@ -749,6 +749,43 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { }(), expectError: false, }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := ValidateRayClusterSpec(&tt.spec, nil) + if tt.expectError { + require.Error(t, err) + assert.EqualError(t, err, tt.errorMessage) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestValidateRayClusterSpec_Labels(t *testing.T) { + // Util function to create a RayCluster spec. + createSpec := func() rayv1.RayClusterSpec { + return rayv1.RayClusterSpec{ + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group", + Template: podTemplateSpec(nil, nil), + }, + }, + } + } + + tests := []struct { + name string + errorMessage string + spec rayv1.RayClusterSpec + expectError bool + }{ { name: "Invalid: Head group has 'labels' in rayStartParams", spec: func() rayv1.RayClusterSpec { @@ -757,7 +794,7 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { return s }(), expectError: true, - errorMessage: "rayStartParams['labels'] is not supported for Head group. Please use the top-level Labels field instead", + errorMessage: "rayStartParams['labels'] is not supported for Head group; please use the top-level Labels field instead", }, { name: "Invalid: Worker group has 'labels' in rayStartParams", @@ -767,7 +804,7 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { return s }(), expectError: true, - errorMessage: "rayStartParams['labels'] is not supported for worker-group group. Please use the top-level Labels field instead", + errorMessage: "rayStartParams['labels'] is not supported for worker-group group; please use the top-level Labels field instead", }, { name: "Valid: Only .Labels field is set", @@ -780,14 +817,24 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { expectError: false, }, { - name: "Invalid: .Labels field values contain commas", + name: "Invalid: Label key does not follow Kubernetes syntax", spec: func() rayv1.RayClusterSpec { s := createSpec() - s.HeadGroupSpec.Labels = map[string]string{"invalid,labels,with,commas": "value"} + s.WorkerGroupSpecs[0].Labels = map[string]string{"invalid_key!": "value"} return s }(), expectError: true, - errorMessage: "label key for Head group cannot contain commas, but found: 'invalid,labels,with,commas'", + errorMessage: "invalid label key for worker-group group: 'invalid_key!', error: name part must consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character", + }, + { + name: "Invalid: Label value does not follow Kubernetes syntax", + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.Labels = map[string]string{"valid-key": "invalid/value"} + return s + }(), + expectError: true, + errorMessage: "invalid label value for key 'valid-key' in Head group: 'invalid/value', error: a valid label must be an empty string or consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character", }, } @@ -796,7 +843,7 @@ func TestValidateRayClusterSpec_ResourcesAndLabels(t *testing.T) { err := ValidateRayClusterSpec(&tt.spec, nil) if tt.expectError { require.Error(t, err) - assert.EqualError(t, err, tt.errorMessage) + assert.Contains(t, err.Error(), tt.errorMessage) } else { require.NoError(t, err) }