Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -600,9 +600,17 @@ public void onEvent(

List<ContainerStatus> backOffContainers = PodUtils.getContainers(pod, cs -> {
ContainerStateWaiting waiting = cs.getState().getWaiting();
return waiting != null
&& waiting.getMessage() != null
&& waiting.getMessage().contains("Back-off pulling image");
Comment on lines -603 to -605
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Introduced originally in #772

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review @Vlatombe! Yes, I saw that #772 originally introduced this logic. The issue is that the code only checked the message field, but in some Kubernetes versions, ImagePullBackOff details appear in the reason field first. My fix checks both fields to ensure backward compatibility while catching all ImagePullBackOff scenarios.

if (waiting == null) {
return false;
}
// Check reason field first as it's the primary indicator set by Kubernetes
String reason = waiting.getReason();
if (reason != null && reason.equalsIgnoreCase(IMAGE_PULL_BACK_OFF)) {
return true;
}
// Fall back to checking message field for backward compatibility
String message = waiting.getMessage();
return message != null && message.contains("Back-off pulling image");
});

if (!backOffContainers.isEmpty()) {
Expand All @@ -611,6 +619,12 @@ public void onEvent(
var podUid = pod.getMetadata().getUid();
var backOffNumber = ttlCache.get(podUid, k -> 0);
ttlCache.put(podUid, ++backOffNumber);
final int currentBackOffNumber = backOffNumber;
LOGGER.log(
Level.FINE,
() -> "ImagePullBackOff detected for pod "
+ pod.getMetadata().getName() + " (event " + currentBackOffNumber + "/"
+ BACKOFF_EVENTS_LIMIT + ")");
if (backOffNumber >= BACKOFF_EVENTS_LIMIT) {
var imagesString = String.join(",", images);
node.getRunListener()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,67 @@ private Pod withContainerImagePullBackoff(Pod pod) {
return pod;
}

private Pod withContainerImagePullBackoffReasonOnly(Pod pod) {
// Simulates real Kubernetes behavior where only the reason field is set
ContainerStatus status = new ContainerStatusBuilder()
.withNewState()
.withNewWaiting(null, "ImagePullBackOff")
.endState()
.build();
pod.getStatus().getContainerStatuses().add(status);
return pod;
}

@Test(timeout = 10_000)
public void testTerminateAgentOnImagePullBackoffReasonFieldOnly() throws IOException, InterruptedException {
// Test the scenario reported in #2772 where only the reason field contains "ImagePullBackOff"
KubernetesCloud cloud = addCloud("k8s", "foo");
KubernetesSlave node = addNode(cloud, "node-124", "node");
Pod node124 = withContainerImagePullBackoffReasonOnly(createPod(node));
Reaper.TerminateAgentOnImagePullBackOff.BACKOFF_EVENTS_LIMIT = 2;

String watchPodsPath = "/api/v1/namespaces/foo/pods?allowWatchBookmarks=true&watch=true";
server.expect()
.withPath(watchPodsPath)
.andUpgradeToWebSocket()
.open()
.waitFor(EVENT_WAIT_PERIOD_MS)
.andEmit(new WatchEvent(node124, "MODIFIED"))
.waitFor(EVENT_WAIT_PERIOD_MS)
.andEmit(new WatchEvent(node124, "BOOKMARK"))
.waitFor(EVENT_WAIT_PERIOD_MS)
.andEmit(new WatchEvent(node124, "MODIFIED"))
.waitFor(EVENT_WAIT_PERIOD_MS)
.andEmit(new WatchEvent(node124, "BOOKMARK"))
.done()
.always();
// don't remove pod on activate
server.expect()
.withPath("/api/v1/namespaces/foo/pods/node-124")
.andReturn(200, node124)
.once();

// activate reaper
Reaper r = Reaper.getInstance();
r.maybeActivate();

// verify node is still registered
assertEquals("jenkins nodes", j.jenkins.getNodes().size(), 1);

// wait for the delete event to be processed
waitForKubeClientRequests(6).assertRequestCountAtLeast(watchPodsPath, 3);

// verify listener got notified
listener.expectEvent(Watcher.Action.MODIFIED, node);

// expect node to be terminated
verify(node, atLeastOnce()).terminate();
// verify computer disconnected with offline cause
verify(node.getComputer(), atLeastOnce()).disconnect(org.mockito.ArgumentMatchers.any(PodOfflineCause.class));
// verify node is still registered (will be removed when pod deleted)
assertEquals("jenkins nodes", j.jenkins.getNodes().size(), 1);
}

private Pod withContainerStatusTerminated(Pod pod) {
PodStatus podStatus = pod.getStatus();
podStatus
Expand Down
Loading