Skip to content
8 changes: 8 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,14 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
return errstd.New(reason)
}
} else if len(headPods.Items) == 0 {
originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey])
if originatedFrom == utils.RayJobCRD {
logger.Info(
"reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't this prevent the head Pod from ever being created? We still need to create the first head Pod. I think you can check the `RayClusterProvisioned` condition to decide whether to create one or not.

"rayCluster", instance.Name,
)
return nil
}
// Create head Pod if it does not exist.
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
if err := r.createHeadPod(ctx, *instance); err != nil {
Expand Down
17 changes: 17 additions & 0 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,23 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
}
}

if rayJobInstance.Spec.SubmissionMode != rayv1.SidecarMode &&
rayJobInstance.Status.JobStatus == rayv1.JobStatusRunning &&
finishedAt != nil {
headPod, err := common.GetRayClusterHeadPod(ctx, r.Client, rayClusterInstance)
if err != nil {
logger.Error(err, "Failed to get head pod for RayCluster")
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}
if headPod == nil {
rayJobInstance.Status.JobDeploymentStatus = rayv1.JobDeploymentStatusFailed
rayJobInstance.Status.JobStatus = rayv1.JobStatusFailed
rayJobInstance.Status.Reason = rayv1.AppFailed
rayJobInstance.Status.Message = "Submitter completed but Ray job not found in RayCluster."
break
}
}

// Check the current status of ray jobs
rayDashboardClient, err := r.dashboardClientFunc(rayClusterInstance, rayJobInstance.Status.DashboardURL)
if err != nil {
Expand Down
Loading