Skip to content
10 changes: 10 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,16 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
return errstd.New(reason)
}
} else if len(headPods.Items) == 0 {
originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey])
if originatedFrom == utils.RayJobCRD {
if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) {
logger.Info(
"reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",
"rayCluster", instance.Name,
)
return nil
}
}
// Create head Pod if it does not exist.
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
if err := r.createHeadPod(ctx, *instance); err != nil {
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/test/e2erayjob/rayjob_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,10 +310,10 @@ env_vars:
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed)))
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
Should(WithTransform(RayJobReason, Equal(rayv1.AppFailed)))
Should(WithTransform(RayJobReason, Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded)))
g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
Should(WithTransform(func(job *rayv1.RayJob) string { return job.Status.Message },
Equal("Submitter completed but Ray job not found in RayCluster.")))
MatchRegexp("The RayJob submitter finished at .* but the ray job did not reach terminal state within .*")))

// Cleanup
err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{})
Expand Down
Loading