Skip to content

Commit

Permalink
#76 Don't provision additional capacity until node will be online (#109)
Browse files Browse the repository at this point in the history
#76 Don't provision additional capacity until node will be online
  • Loading branch information
terma authored Jul 2, 2019
1 parent 7898f23 commit 126036d
Show file tree
Hide file tree
Showing 14 changed files with 799 additions and 427 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
import hudson.slaves.DelegatingComputerLauncher;
import hudson.slaves.OfflineCause;
import hudson.slaves.SlaveComputer;
import hudson.util.StreamTaskListener;
import jenkins.model.CauseOfInterruption;

import javax.annotation.concurrent.ThreadSafe;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
Expand All @@ -30,6 +29,7 @@
* @see EC2FleetNode
* @see EC2FleetNodeComputer
*/
@ThreadSafe
public class EC2FleetAutoResubmitComputerLauncher extends DelegatingComputerLauncher {

private static final Level LOG_LEVEL = Level.INFO;
Expand Down
92 changes: 62 additions & 30 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetCloud.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package com.amazon.jenkins.ec2fleet;

import com.amazonaws.services.ec2.AmazonEC2;
import com.amazonaws.regions.RegionUtils;
import com.amazonaws.services.ec2.AmazonEC2;
import com.amazonaws.services.ec2.model.BatchState;
import com.amazonaws.services.ec2.model.DescribeInstancesRequest;
import com.amazonaws.services.ec2.model.DescribeInstancesResult;
Expand Down Expand Up @@ -50,6 +50,7 @@
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
Expand All @@ -62,6 +63,9 @@ public class EC2FleetCloud extends Cloud {

public static final String FLEET_CLOUD_ID = "FleetCloud";

private static final int DEFAULT_INIT_ONLINE_TIMEOUT_SEC = 3 * 60;
private static final int DEFAULT_INIT_ONLINE_CHECK_INTERVAL_SEC = 15;

private static final SimpleFormatter sf = new SimpleFormatter();
private static final Logger LOGGER = Logger.getLogger(EC2FleetCloud.class.getName());

Expand Down Expand Up @@ -97,6 +101,8 @@ public class EC2FleetCloud extends Cloud {
private final Integer numExecutors;
private final boolean addNodeOnlyIfRunning;
private final boolean restrictUsage;
private final Integer initOnlineTimeoutSec;
private final Integer initOnlineCheckIntervalSec;

/**
* @see EC2FleetAutoResubmitComputerLauncher
Expand Down Expand Up @@ -127,7 +133,9 @@ public EC2FleetCloud(final String name,
final Integer numExecutors,
final boolean addNodeOnlyIfRunning,
final boolean restrictUsage,
final boolean disableTaskResubmit) {
final boolean disableTaskResubmit,
final Integer initOnlineTimeoutSec,
final Integer initOnlineCheckIntervalSec) {
super(StringUtils.isBlank(name) ? FLEET_CLOUD_ID : name);
initCaches();
this.credentialsId = credentialsId;
Expand All @@ -147,6 +155,8 @@ public EC2FleetCloud(final String name,
this.addNodeOnlyIfRunning = addNodeOnlyIfRunning;
this.restrictUsage = restrictUsage;
this.disableTaskResubmit = disableTaskResubmit;
this.initOnlineTimeoutSec = initOnlineTimeoutSec;
this.initOnlineCheckIntervalSec = initOnlineCheckIntervalSec;
}

/**
Expand All @@ -159,7 +169,17 @@ public String getAwsCredentialsId() {
return StringUtils.isNotBlank(awsCredentialsId) ? awsCredentialsId : credentialsId;
}

public boolean isDisableTaskResubmit() { return disableTaskResubmit; }
public boolean isDisableTaskResubmit() {
return disableTaskResubmit;
}

public int getInitOnlineTimeoutSec() {
return initOnlineTimeoutSec == null ? DEFAULT_INIT_ONLINE_TIMEOUT_SEC : initOnlineTimeoutSec;
}

public int getInitOnlineCheckIntervalSec() {
return initOnlineCheckIntervalSec == null ? DEFAULT_INIT_ONLINE_CHECK_INTERVAL_SEC : initOnlineCheckIntervalSec;
}

public String getRegion() {
return region;
Expand Down Expand Up @@ -272,10 +292,9 @@ public synchronized Collection<NodeProvisioner.PlannedNode> provisionInternal(

final List<NodeProvisioner.PlannedNode> resultList = new ArrayList<>();
for (int f = 0; f < toProvision; ++f) {
final SettableFuture<Node> futureNode = SettableFuture.create();
// todo make name unique per fleet
final NodeProvisioner.PlannedNode plannedNode = new NodeProvisioner.PlannedNode(
"FleetNode-" + f, futureNode, this.numExecutors);
"FleetNode-" + f, SettableFuture.<Node>create(), this.numExecutors);
resultList.add(plannedNode);
this.plannedNodesCache.add(plannedNode);
}
Expand Down Expand Up @@ -359,7 +378,7 @@ public synchronized FleetStateStats updateStatus() {
// If we have new instances - create nodes for them!
try {
if (newFleetInstances.size() > 0) {
info("Found new instances from fleet (" + getLabelString() + "): [" +
info("Found new instances [" +
StringUtils.join(newFleetInstances, ", ") + "]");
}
for (final String instanceId : newFleetInstances) {
Expand Down Expand Up @@ -458,13 +477,13 @@ private synchronized void removeNode(String instanceId) {
}
}

/**
* https://github.com/jenkinsci/ec2-plugin/blob/master/src/main/java/hudson/plugins/ec2/EC2Cloud.java#L640
*
* @param ec2 ec2 client
* @param instanceId instance id
*/
private void addNewSlave(final AmazonEC2 ec2, final String instanceId) throws Exception {
// Generate a random FS root if one isn't specified
String effectiveFsRoot = fsRoot;
if (StringUtils.isBlank(effectiveFsRoot)) {
effectiveFsRoot = "/tmp/jenkins-" + UUID.randomUUID().toString().substring(0, 8);
}

final DescribeInstancesResult result = ec2.describeInstances(
new DescribeInstancesRequest().withInstanceIds(instanceId));
//Can't find this instance, skip it
Expand All @@ -480,50 +499,63 @@ private void addNewSlave(final AmazonEC2 ec2, final String instanceId) throws Ex
// Check if we have the address to use. Nodes don't get it immediately.
if (address == null) return; // Wait some more...

// Generate a random FS root if one isn't specified
String effectiveFsRoot = fsRoot;
if (StringUtils.isBlank(effectiveFsRoot)) {
effectiveFsRoot = "/tmp/jenkins-" + UUID.randomUUID().toString().substring(0, 8);
}

final EC2FleetAutoResubmitComputerLauncher computerLauncher = new EC2FleetAutoResubmitComputerLauncher(
computerConnector.launch(address, TaskListener.NULL), disableTaskResubmit);
final Node.Mode nodeMode = restrictUsage ? Node.Mode.EXCLUSIVE : Node.Mode.NORMAL;
final EC2FleetNode slave = new EC2FleetNode(instanceId, "Fleet slave for " + instanceId,
final EC2FleetNode node = new EC2FleetNode(instanceId, "Fleet slave for " + instanceId,
effectiveFsRoot, numExecutors.toString(), nodeMode, labelString, new ArrayList<NodeProperty<?>>(),
name, computerLauncher);

// Initialize our retention strategy
slave.setRetentionStrategy(new IdleRetentionStrategy(this));
node.setRetentionStrategy(new IdleRetentionStrategy(this));

final Jenkins jenkins = Jenkins.getInstance();
//noinspection SynchronizationOnLocalVariableOrMethodParameter
synchronized (jenkins) {
// Try to avoid duplicate nodes
final Node n = jenkins.getNode(instanceId);
if (n != null)
jenkins.removeNode(n);
jenkins.addNode(slave);
// jenkins automatically remove old node with same name if any
jenkins.addNode(node);
}

// A new node, wheee!
if (!plannedNodesCache.isEmpty()) {
//If we're waiting for a new node - mark it as ready
final NodeProvisioner.PlannedNode curNode = plannedNodesCache.iterator().next();
plannedNodesCache.remove(curNode);
((SettableFuture<Node>) curNode.future).set(slave);
info("set slave %s to planned node", slave.getNodeName());
final SettableFuture<Node> future;
if (plannedNodesCache.isEmpty()) {
future = SettableFuture.create();
} else {
final NodeProvisioner.PlannedNode plannedNode = plannedNodesCache.iterator().next();
plannedNodesCache.remove(plannedNode);
future = ((SettableFuture<Node>) plannedNode.future);
}

// use getters for timeout and interval as they provide default value
// when user just install new version and did't recreate fleet
EC2FleetOnlineChecker.start(node, future,
TimeUnit.SECONDS.toMillis(getInitOnlineTimeoutSec()),
TimeUnit.SECONDS.toMillis(getInitOnlineCheckIntervalSec()));
}

private String getLogPrefix() {
return getDisplayName() + " [" + getLabelString() + "] ";
}

private void info(final String msg, final Object... args) {
LOGGER.info("fleet = " + getDisplayName() + " label = " + getLabelString() + " " + String.format(msg, args));
LOGGER.info(getLogPrefix() + String.format(msg, args));
}

private void fine(final String msg, final Object... args) {
LOGGER.fine("fleet = " + getDisplayName() + " label = " + getLabelString() + " " + String.format(msg, args));
LOGGER.fine(getLogPrefix() + String.format(msg, args));
}

private void warning(final String msg, final Object... args) {
LOGGER.warning("fleet = " + getDisplayName() + " label = " + getLabelString() + " " + String.format(msg, args));
LOGGER.warning(getLogPrefix() + String.format(msg, args));
}

private void warning(final Throwable t, final String msg, final Object... args) {
LOGGER.log(Level.WARNING, "fleet = " + getDisplayName() + " label = " + getLabelString() + " " + String.format(msg, args), t);
LOGGER.log(Level.WARNING, getLogPrefix() + String.format(msg, args), t);
}

@Extension
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package com.amazon.jenkins.ec2fleet;

import com.google.common.util.concurrent.SettableFuture;
import hudson.model.Computer;
import hudson.model.Node;
import hudson.util.DaemonThreadFactory;

import javax.annotation.concurrent.ThreadSafe;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* Keep {@link hudson.slaves.NodeProvisioner.PlannedNode#future} not resolved until node will not be online
* or timeout reached.
* <p>
* Default Jenkins node capacity planner {@link hudson.slaves.NodeProvisioner.Strategy} count planned nodes
* as available capacity, but exclude offline computers {@link Computer#isOnline()} from available capacity.
* Because EC2 instance requires some time when it was added into fleet to start up, next situation happens:
* plugin add described capacity as node into Jenkins pool, but Jenkins keeps it as offline as no way to connect,
* during time when node is offline, Jenkins will try to request more nodes from plugin as offline nodes
* excluded from capacity.
* <p>
* This class fix this situation and keep planned node until instance is really online, so Jenkins planner
* count planned node as available capacity and doesn't request more.
* <p>
* Based on https://github.com/jenkinsci/ec2-plugin/blob/master/src/main/java/hudson/plugins/ec2/EC2Cloud.java#L640
*
* @see EC2FleetCloud
* @see EC2FleetNode
*/
@SuppressWarnings("WeakerAccess")
@ThreadSafe
class EC2FleetOnlineChecker implements Runnable {

private static final Logger LOGGER = Logger.getLogger(EC2FleetOnlineChecker.class.getName());
// use daemon thread, so no problem when stop jenkins
private static final ScheduledExecutorService EXECUTOR = Executors.newSingleThreadScheduledExecutor(new DaemonThreadFactory());

public static void start(final Node node, final SettableFuture<Node> future, final long timeout, final long interval) {
EXECUTOR.execute(new EC2FleetOnlineChecker(node, future, timeout, interval));
}

private final long start;
private final Node node;
private final SettableFuture<Node> future;
private final long timeout;
private final long interval;

private EC2FleetOnlineChecker(
final Node node, final SettableFuture<Node> future, final long timeout, final long interval) {
this.start = System.currentTimeMillis();
this.node = node;
this.future = future;
this.timeout = timeout;
this.interval = interval;
}

@Override
public void run() {
if (future.isCancelled()) {
return;
}

if (timeout < 1) {
future.set(node);
LOGGER.log(Level.INFO, String.format("%s connection check disabled, resolve planned node", node.getNodeName()));
return;
}

if (System.currentTimeMillis() - start > timeout) {
future.setException(new IllegalStateException(
"Fail to provision node, cannot connect to " + node.getNodeName() + " in " + timeout + " msec"));
return;
}

final Computer computer = node.toComputer();
if (computer != null) {
if (computer.isOnline()) {
future.set(node);
LOGGER.log(Level.INFO, String.format("%s connected, resolve planned node", node.getNodeName()));
return;
}
}

LOGGER.log(Level.INFO, String.format("%s no connection, wait before retry", node.getNodeName()));
EXECUTOR.schedule(this, interval, TimeUnit.MILLISECONDS);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hudson.slaves.RetentionStrategy;
import hudson.slaves.SlaveComputer;

import javax.annotation.concurrent.GuardedBy;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand All @@ -29,6 +30,13 @@ public IdleRetentionStrategy(final EC2FleetCloud cloud) {
this.cloud = cloud;
}

/**
* Will be called under {@link hudson.model.Queue#withLock(Runnable)}
*
* @param c computer
* @return delay in min before next run
*/
@GuardedBy("Queue.withLock")
@Override
public long check(final SlaveComputer c) {
// Ensure that the EC2FleetCloud cannot be mutated from under us while
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@
<f:entry title="${%Disable build resubmit}" field="disableTaskResubmit">
<f:checkbox />
</f:entry>

<f:description>Maximum time to wait for EC2 instance startup</f:description>
<f:entry title="${%Maximum Init Connection Timeout in sec}" field="initOnlineTimeoutSec">
<f:number clazz="required positive-number" default="180" />
</f:entry>
</f:section>

</j:jelly>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Specify maximum time which Jenkins will wait for EC2 instance startup before
trying to request new instance from fleet.

<p>
By default Jenkins expect that node will be available immediately, however EC2 instance
requires some time to be provision, start and be ready for actual builds. In result
Jenkins might request more capacity then actually required (over-provision). This setting
avoids over-provision by force Jenkins to wait until EC2 instance will be online and
don't request more capacity.
</p>

<p>
In case of positive value, will force Jenkins to wait when EC2 instance will be online,
no more then specified time in seconds. Cannot be negative.
In case of <code>0</code>, timeout will be disabled and Jenkins will
not wait any time to start up EC2 instance, which could lead to over-provision.
</p>
<p>
<b>Note</b> Be careful with very big values (more 5 min) for this settings. In case of any problem
with node (no java, wrong version of java, Jenkins agent cannot start), so it cannot be online.
Jenkins will wait, without requesting additional capacity.
</p>
<p>
Behavior of previous versions of plugin could be represented as this version
with timeout set to <code>0</code>.
</p>
Loading

0 comments on commit 126036d

Please sign in to comment.