diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 32d730db368..18b57f19922 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -619,6 +619,16 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv return errstd.New(reason) } } else if len(headPods.Items) == 0 { + originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey]) + if originatedFrom == utils.RayJobCRD { + if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { + logger.Info( + "reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", + "rayCluster", instance.Name, + ) + return nil + } + } // Create head Pod if it does not exist. logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.") if err := r.createHeadPod(ctx, *instance); err != nil { diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 70de0b8718c..35a06cd16b7 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -310,10 +310,10 @@ env_vars: g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(RayJobReason, Equal(rayv1.AppFailed))) + Should(WithTransform(RayJobReason, Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(func(job *rayv1.RayJob) string { return job.Status.Message }, - Equal("Submitter completed but Ray job not found in RayCluster."))) + MatchRegexp("The RayJob submitter finished at .* but the ray job did not reach terminal state within .*"))) // Cleanup err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{})