diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 540af1782..b9f767d1c 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -600,9 +600,17 @@ public void onEvent( List backOffContainers = PodUtils.getContainers(pod, cs -> { ContainerStateWaiting waiting = cs.getState().getWaiting(); - return waiting != null - && waiting.getMessage() != null - && waiting.getMessage().contains("Back-off pulling image"); + if (waiting == null) { + return false; + } + // Check reason field first as it's the primary indicator set by Kubernetes + String reason = waiting.getReason(); + if (reason != null && reason.equalsIgnoreCase(IMAGE_PULL_BACK_OFF)) { + return true; + } + // Fall back to checking message field for backward compatibility + String message = waiting.getMessage(); + return message != null && message.contains("Back-off pulling image"); }); if (!backOffContainers.isEmpty()) { @@ -611,6 +619,12 @@ public void onEvent( var podUid = pod.getMetadata().getUid(); var backOffNumber = ttlCache.get(podUid, k -> 0); ttlCache.put(podUid, ++backOffNumber); + final int currentBackOffNumber = backOffNumber; + LOGGER.log( + Level.FINE, + () -> "ImagePullBackOff detected for pod " + + pod.getMetadata().getName() + " (event " + currentBackOffNumber + "/" + + BACKOFF_EVENTS_LIMIT + ")"); if (backOffNumber >= BACKOFF_EVENTS_LIMIT) { var imagesString = String.join(",", images); node.getRunListener() diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java index 5aaebdb99..181fb2cb0 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java @@ -673,6 +673,67 @@ private Pod withContainerImagePullBackoff(Pod pod) { return pod; } + private Pod withContainerImagePullBackoffReasonOnly(Pod pod) { + // Simulates real Kubernetes behavior where only the reason field is set + ContainerStatus status = new ContainerStatusBuilder() + .withNewState() + .withNewWaiting(null, "ImagePullBackOff") + .endState() + .build(); + pod.getStatus().getContainerStatuses().add(status); + return pod; + } + + @Test(timeout = 10_000) + public void testTerminateAgentOnImagePullBackoffReasonFieldOnly() throws IOException, InterruptedException { + // Test the scenario reported in #2772 where only the reason field contains "ImagePullBackOff" + KubernetesCloud cloud = addCloud("k8s", "foo"); + KubernetesSlave node = addNode(cloud, "node-124", "node"); + Pod node124 = withContainerImagePullBackoffReasonOnly(createPod(node)); + Reaper.TerminateAgentOnImagePullBackOff.BACKOFF_EVENTS_LIMIT = 2; + + String watchPodsPath = "/api/v1/namespaces/foo/pods?allowWatchBookmarks=true&watch=true"; + server.expect() + .withPath(watchPodsPath) + .andUpgradeToWebSocket() + .open() + .waitFor(EVENT_WAIT_PERIOD_MS) + .andEmit(new WatchEvent(node124, "MODIFIED")) + .waitFor(EVENT_WAIT_PERIOD_MS) + .andEmit(new WatchEvent(node124, "BOOKMARK")) + .waitFor(EVENT_WAIT_PERIOD_MS) + .andEmit(new WatchEvent(node124, "MODIFIED")) + .waitFor(EVENT_WAIT_PERIOD_MS) + .andEmit(new WatchEvent(node124, "BOOKMARK")) + .done() + .always(); + // don't remove pod on activate + server.expect() + .withPath("/api/v1/namespaces/foo/pods/node-124") + .andReturn(200, node124) + .once(); + + // activate reaper + Reaper r = Reaper.getInstance(); + r.maybeActivate(); + + // verify node is still registered + assertEquals("jenkins nodes", j.jenkins.getNodes().size(), 1); + + // wait for the delete event to be processed + waitForKubeClientRequests(6).assertRequestCountAtLeast(watchPodsPath, 3); + + // verify listener got notified + listener.expectEvent(Watcher.Action.MODIFIED, node); + + // expect node to be terminated + verify(node, atLeastOnce()).terminate(); + // verify computer disconnected with offline cause + verify(node.getComputer(), atLeastOnce()).disconnect(org.mockito.ArgumentMatchers.any(PodOfflineCause.class)); + // verify node is still registered (will be removed when pod deleted) + assertEquals("jenkins nodes", j.jenkins.getNodes().size(), 1); + } + private Pod withContainerStatusTerminated(Pod pod) { PodStatus podStatus = pod.getStatus(); podStatus