diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimits.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimits.java index 749db4601..57e9ba96b 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimits.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimits.java @@ -165,6 +165,48 @@ int getPodTemplateCount(String podTemplate) { return podTemplateCounts.getOrDefault(podTemplate, 0); } + /** + * Unregister executors using persistent field values (cloudName and templateId). + * This method works correctly even after Jenkins restart when transient template references are null. + * + * @param cloudName the kubernetes cloud name + * @param podTemplateId the pod template ID + * @param numExecutors the number of executors (pretty much always 1) + */ + public void unregisterByIds(@NonNull String cloudName, @NonNull String podTemplateId, int numExecutors) { + if (initInstance()) { + synchronized (this) { + // Decrement each counter independently if it was registered + int currentGlobalCount = getGlobalCount(cloudName); + if (currentGlobalCount > 0) { + int newGlobalCount = currentGlobalCount - numExecutors; + if (newGlobalCount < 0) { + LOGGER.log( + Level.WARNING, + "Global count for " + cloudName + + " went below zero. There is likely a bug in kubernetes-plugin"); + } + cloudCounts.put(cloudName, Math.max(0, newGlobalCount)); + LOGGER.log(Level.FINEST, () -> cloudName + " global limit: " + Math.max(0, newGlobalCount)); + } + + int currentPodTemplateCount = getPodTemplateCount(podTemplateId); + if (currentPodTemplateCount > 0) { + int newPodTemplateCount = currentPodTemplateCount - numExecutors; + if (newPodTemplateCount < 0) { + LOGGER.log( + Level.WARNING, + "Pod template count for " + podTemplateId + + " went below zero. There is likely a bug in kubernetes-plugin"); + } + podTemplateCounts.put(podTemplateId, Math.max(0, newPodTemplateCount)); + LOGGER.log( + Level.FINEST, () -> podTemplateId + " template limit: " + Math.max(0, newPodTemplateCount)); + } + } + } + } + @Extension public static class NodeListenerImpl extends NodeListener { @Override @@ -172,10 +214,10 @@ protected void onDeleted(@NonNull Node node) { if (node instanceof KubernetesSlave) { KubernetesProvisioningLimits instance = KubernetesProvisioningLimits.get(); KubernetesSlave kubernetesNode = (KubernetesSlave) node; - PodTemplate template = kubernetesNode.getTemplateOrNull(); - if (template != null) { - instance.unregister(kubernetesNode.getKubernetesCloud(), template, node.getNumExecutors()); - } + // Use persistent fields (cloudName, templateId) instead of transient template object + // This works correctly even after Jenkins restart when template reference is null + instance.unregisterByIds( + kubernetesNode.getCloudName(), kubernetesNode.getTemplateId(), node.getNumExecutors()); } } } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimitsTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimitsTest.java index 6e42cb797..0107ff70e 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimitsTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimitsTest.java @@ -91,4 +91,51 @@ public void lotsOfCloudsAndTemplates() throws InterruptedException { } } } + + @Test + public void testCounterDecrementAfterRestartWithEphemeralTemplate() throws Exception { + // Create a cloud with an ephemeral template + KubernetesCloud cloud = new KubernetesCloud("test-cloud"); + cloud.setContainerCap(10); + j.jenkins.clouds.add(cloud); + + // Create an ephemeral pod template (not saved to cloud config) + PodTemplate ephemeralTemplate = new PodTemplate(); + ephemeralTemplate.setName("ephemeral-template"); + ephemeralTemplate.setInstanceCap(5); + + // Register the template (simulates agent creation) + KubernetesProvisioningLimits limits = KubernetesProvisioningLimits.get(); + assertTrue("Should successfully register template", limits.register(cloud, ephemeralTemplate, 1)); + + // Get the template ID that was auto-generated + String templateId = ephemeralTemplate.getId(); + + // Verify counters were incremented after registration + assertEquals("Global count should be 1 after registration", 1, limits.getGlobalCount("test-cloud")); + assertEquals("Template count should be 1 after registration", 1, limits.getPodTemplateCount(templateId)); + + // Create a KubernetesSlave using Builder pattern + KubernetesSlave slave = new KubernetesSlave.Builder() + .podTemplate(ephemeralTemplate) + .cloud(cloud) + .nodeDescription("Test agent for counter leak fix") + .build(); + + // Add the slave to Jenkins + j.jenkins.addNode(slave); + + // Simulate ephemeral template removal (happens with idleMinutes config after build completion) + cloud.removeTemplate(ephemeralTemplate); + + // Remove the node (simulates agent deletion) + // The fix ensures counters are decremented using cloudName and templateId, + // even when template reference is null (as happens with ephemeral templates after removal/restart) + j.jenkins.removeNode(slave); + + // After deletion, counters should be decremented back to 0 + // This is the bug fix: should work even when template reference is null + assertEquals("Global count should be 0 after node deletion", 0, limits.getGlobalCount("test-cloud")); + assertEquals("Template count should be 0 after node deletion", 0, limits.getPodTemplateCount(templateId)); + } }