-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Fix counter leak when Jenkins restarts with ephemeral templates #2787
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 3 commits
97ad743
ee2293b
88d704b
2728ae9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -165,17 +165,58 @@ int getPodTemplateCount(String podTemplate) { | |
| return podTemplateCounts.getOrDefault(podTemplate, 0); | ||
| } | ||
|
|
||
| /** | ||
| * Unregister executors using persistent field values (cloudName and templateId). | ||
| * This method works correctly even after Jenkins restart when transient template references are null. | ||
| * | ||
| * @param cloudName the kubernetes cloud name | ||
| * @param podTemplateId the pod template ID | ||
| * @param numExecutors the number of executors (pretty much always 1) | ||
| */ | ||
| public void unregisterByIds(@NonNull String cloudName, @NonNull String podTemplateId, int numExecutors) { | ||
| if (initInstance()) { | ||
| synchronized (this) { | ||
| // Only unregister if the counts exist (node was actually registered) | ||
| int currentGlobalCount = getGlobalCount(cloudName); | ||
| int currentPodTemplateCount = getPodTemplateCount(podTemplateId); | ||
|
|
||
| if (currentGlobalCount > 0 || currentPodTemplateCount > 0) { | ||
| int newGlobalCount = currentGlobalCount - numExecutors; | ||
| if (newGlobalCount < 0) { | ||
| LOGGER.log( | ||
| Level.WARNING, | ||
| "Global count for " + cloudName | ||
| + " went below zero. There is likely a bug in kubernetes-plugin"); | ||
| } | ||
| cloudCounts.put(cloudName, Math.max(0, newGlobalCount)); | ||
| LOGGER.log(Level.FINEST, () -> cloudName + " global limit: " + Math.max(0, newGlobalCount)); | ||
|
|
||
| int newPodTemplateCount = currentPodTemplateCount - numExecutors; | ||
| if (newPodTemplateCount < 0) { | ||
| LOGGER.log( | ||
| Level.WARNING, | ||
| "Pod template count for " + podTemplateId | ||
| + " went below zero. There is likely a bug in kubernetes-plugin"); | ||
| } | ||
| podTemplateCounts.put(podTemplateId, Math.max(0, newPodTemplateCount)); | ||
| LOGGER.log( | ||
| Level.FINEST, () -> podTemplateId + " template limit: " + Math.max(0, newPodTemplateCount)); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick:
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point on the logging consistency. I'll add the containerCap to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see changes on that front, is this expected ? |
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Extension | ||
| public static class NodeListenerImpl extends NodeListener { | ||
| @Override | ||
| protected void onDeleted(@NonNull Node node) { | ||
| if (node instanceof KubernetesSlave) { | ||
| KubernetesProvisioningLimits instance = KubernetesProvisioningLimits.get(); | ||
| KubernetesSlave kubernetesNode = (KubernetesSlave) node; | ||
| PodTemplate template = kubernetesNode.getTemplateOrNull(); | ||
| if (template != null) { | ||
| instance.unregister(kubernetesNode.getKubernetesCloud(), template, node.getNumExecutors()); | ||
| } | ||
| // Use persistent fields (cloudName, templateId) instead of transient template object | ||
| // This works correctly even after Jenkins restart when template reference is null | ||
| instance.unregisterByIds( | ||
| kubernetesNode.getCloudName(), kubernetesNode.getTemplateId(), node.getNumExecutors()); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,4 +91,48 @@ public void lotsOfCloudsAndTemplates() throws InterruptedException { | |
| } | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testCounterDecrementAfterRestartWithEphemeralTemplate() throws Exception { | ||
| // Create a cloud with an ephemeral template | ||
| KubernetesCloud cloud = new KubernetesCloud("test-cloud"); | ||
| cloud.setContainerCap(10); | ||
| j.jenkins.clouds.add(cloud); | ||
|
|
||
| // Create an ephemeral pod template (not saved to cloud config) | ||
| PodTemplate ephemeralTemplate = new PodTemplate(); | ||
| ephemeralTemplate.setName("ephemeral-template"); | ||
| ephemeralTemplate.setInstanceCap(5); | ||
|
|
||
| // Register the template (simulates agent creation) | ||
| KubernetesProvisioningLimits limits = KubernetesProvisioningLimits.get(); | ||
| assertTrue("Should successfully register template", limits.register(cloud, ephemeralTemplate, 1)); | ||
|
|
||
| // Get the template ID that was auto-generated | ||
| String templateId = ephemeralTemplate.getId(); | ||
|
|
||
| // Verify counters were incremented after registration | ||
| assertEquals("Global count should be 1 after registration", 1, limits.getGlobalCount("test-cloud")); | ||
| assertEquals("Template count should be 1 after registration", 1, limits.getPodTemplateCount(templateId)); | ||
|
|
||
| // Create a KubernetesSlave using Builder pattern | ||
| KubernetesSlave slave = new KubernetesSlave.Builder() | ||
| .podTemplate(ephemeralTemplate) | ||
| .cloud(cloud) | ||
| .nodeDescription("Test agent for counter leak fix") | ||
| .build(); | ||
|
|
||
| // Add the slave to Jenkins | ||
| j.jenkins.addNode(slave); | ||
|
|
||
| // Remove the node (simulates agent deletion) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the suggestion! I'll look into using RestartableJenkinsRule There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Calling |
||
| // The fix ensures counters are decremented using cloudName and templateId, | ||
| // even when template reference is null (as happens with ephemeral templates after restart) | ||
| j.jenkins.removeNode(slave); | ||
|
|
||
| // After deletion, counters should be decremented back to 0 | ||
| // This is the bug fix: should work even when template reference is null | ||
| assertEquals("Global count should be 0 after node deletion", 0, limits.getGlobalCount("test-cloud")); | ||
| assertEquals("Template count should be 0 after node deletion", 0, limits.getPodTemplateCount(templateId)); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please consider decrement each counter independently, like different template but same cloud
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're right - we should check each counter independently.
I'll update this to:
Will push the fix shortly.