diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index c7ff5ec782a..aa474d35f06 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -103,6 +103,9 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s if err := validateRayGroupLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Labels); err != nil { return err } + if err := validateWorkerGroupIdleTimeout(workerGroup, spec); err != nil { + return err + } } if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil { @@ -574,3 +577,18 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return nil } + +// validateWorkerGroupIdleTimeout validates the idleTimeoutSeconds field in a worker group spec +func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *rayv1.RayClusterSpec) error { + idleTimeoutSeconds := workerGroup.IdleTimeoutSeconds + if idleTimeoutSeconds != nil && *idleTimeoutSeconds < 0 { + return fmt.Errorf("idleTimeoutSeconds must be non-negative, got %d", *idleTimeoutSeconds) + } + + // idleTimeoutSeconds only allowed on autoscaler v2 + if idleTimeoutSeconds != nil && !IsAutoscalingV2Enabled(spec) { + return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but autoscaler version is not v2. Please set .spec.autoscalerOptions.version to v2", workerGroup.GroupName) + } + + return nil +} diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 88aee6c9fc4..720a6306483 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1823,3 +1823,164 @@ func TestValidateClusterUpgradeOptions(t *testing.T) { }) } } + +func TestValidateWorkerGroupIdleTimeout(t *testing.T) { + tests := map[string]struct { + expectedErr string + spec rayv1.RayClusterSpec + }{ + "should accept worker group with valid idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + }, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + "should reject negative idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + }, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(-10)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "idleTimeoutSeconds must be non-negative, got -10", + }, + "should accept zero idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + }, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(0)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + "should reject idleTimeoutSeconds when autoscaler version is not v2": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV1), + }, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler version is not v2. Please set .spec.autoscalerOptions.version to v2", + }, + "should reject idleTimeoutSeconds when autoscaler version is not set": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler version is not v2. Please set .spec.autoscalerOptions.version to v2", + }, + "should reject idleTimeoutSeconds when AutoscalerOptions is nil": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: nil, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler version is not v2. Please set .spec.autoscalerOptions.version to v2", + }, + "should accept worker group without idleTimeoutSeconds and without autoscaler v2": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + AutoscalerOptions: &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV1), + }, + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + err := ValidateRayClusterSpec(&tc.spec, nil) + if tc.expectedErr != "" { + require.Error(t, err) + require.EqualError(t, err, tc.expectedErr) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go index afd2d60b16f..b74373268a3 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go @@ -42,6 +42,8 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). WithRayVersion(GetRayVersion()). + WithAutoscalerOptions(rayv1ac.AutoscalerOptions(). + WithVersion(rayv1.AutoscalerVersionV2)). WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). WithRayStartParams(map[string]string{"num-cpus": "0"}). WithTemplate(tc.HeadPodTemplateGetter())).