From 767008133874c3241a823f29257b03fcb30ff432 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 18 Sep 2025 17:38:52 +0800 Subject: [PATCH] fix: ensure graceful shutdown for Ray head pods when GCS fault tolerance is enabled Signed-off-by: Future-Outlier --- ray-operator/controllers/ray/common/pod.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 2b5aae59e3e..9bbe05b8afc 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -85,6 +85,20 @@ func configureGCSFaultTolerance(podTemplate *corev1.PodTemplateSpec, instance ra options := instance.Spec.GcsFaultToleranceOptions container := &podTemplate.Spec.Containers[utils.RayContainerIndex] + // Configure lifecycle preStop hook for graceful shutdown when GCS FT is enabled + if rayNodeType == rayv1.HeadNode { + if container.Lifecycle == nil { + container.Lifecycle = &corev1.Lifecycle{} + } + if container.Lifecycle.PreStop == nil { + container.Lifecycle.PreStop = &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"/bin/sh", "-c", "ray stop --force"}, + }, + } + } + } + // Configure the GCS RPC server reconnect timeout for GCS FT. if !utils.EnvVarExists(utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, container.Env) && rayNodeType == rayv1.WorkerNode { // If GCS FT is enabled and RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S is not set, set the worker's