Skip to content

Commit

Permalink
Re-enable ASG scaling before final stabilisation
Browse files Browse the repository at this point in the history
This aims to address, to some extent, issue #1342 -
the problem that *apps can not auto-scale* until an autoscaling deploy has
successfully completed. On 22nd May 2024, this inability to auto-scale led
to a severe outage in the Ophan Tracker.

Ever since #83 in
April 2013, Riff Raff has disabled ASG scaling alarms at the start of a deploy
(`SuspendAlarmNotifications`), and only re-enabled them at the end of the deploy,
(`ResumeAlarmNotifications`) once deployment has successfully completed.

In December 2016, with #403, an
additional `WaitForStabilization` was added as a penultimate deploy step,
with the aim of ensuring that the cull of old instances has _completed_
before the deploy ends. However, the `WaitForStabilization` step was added _before_ `ResumeAlarmNotifications`, rather than _after_, and nothing in the PR description
indicates this was a necessary choice. We can see the argument for it -
obviously, the ASG will be quicker to stabilise if it's not being auto-scaled by
alarms - but if the ASG instances are already overloaded and recycling, the ASG
will _never_ stabilise, because it needs to scale up to handle the load it's
experiencing.

By simply putting the final `WaitForStabilization` step _after_
`ResumeAlarmNotifications`, the Ophan outage would have been shortened from
1 hour to ~2 minutes.

The `WaitForStabilization` step itself simply checks that the number of instances
in the ASG & ELB matches the _desired size_ of the ASG. So long as the ASG is not
scaling up & down very rapidly, we can easily tolerate the ASG scaling up once or
twice, and the `WaitForStabilization` condition will still be easily satisfied,
and the deploy will be reported as success.
  • Loading branch information
rtyley committed Jun 4, 2024
1 parent c001861 commit 753e0e2
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,12 @@ object AutoScaling extends DeploymentType with BucketParameters {
target.region,
terminationGrace(pkg, target, reporter)
),
ResumeAlarmNotifications(autoScalingGroup, target.region),
WaitForStabilization(
autoScalingGroup,
secondsToWait(pkg, target, reporter),
target.region
),
ResumeAlarmNotifications(autoScalingGroup, target.region)
)
)
}
val groupsToUpdate: List[AutoScalingGroupInfo] =
Expand Down
90 changes: 52 additions & 38 deletions magenta-lib/src/main/scala/magenta/tasks/ASGTasks.scala
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
package magenta.tasks

import magenta.deployment_type.AutoScalingGroupInfo
import magenta.tasks.CullInstancesWithTerminationTag.TerminatingStates
import magenta.{KeyRing, _}
import software.amazon.awssdk.awscore.exception.AwsServiceException
import software.amazon.awssdk.services.autoscaling.AutoScalingClient
import software.amazon.awssdk.services.autoscaling.model.{
AutoScalingGroup,
LifecycleState,
SetInstanceProtectionRequest
}
import software.amazon.awssdk.services.autoscaling.model.{AutoScalingGroup, Instance, LifecycleState, SetInstanceProtectionRequest}
import software.amazon.awssdk.services.ec2.Ec2Client

import java.time.Duration
import scala.jdk.CollectionConverters._
Expand Down Expand Up @@ -207,6 +205,16 @@ case class WaitForStabilization(
s"Check the desired number of hosts in both the ASG ($asgName) and ELB are up and that the number of hosts match"
}

object CullInstancesWithTerminationTag {
// See https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html#lifecycle-hooks-overview
val TerminatingStates: Set[LifecycleState] = Set(
LifecycleState.TERMINATING,
LifecycleState.TERMINATING_WAIT,
LifecycleState.TERMINATING_PROCEED,
LifecycleState.TERMINATED
)
}

case class CullInstancesWithTerminationTag(
info: AutoScalingGroupInfo,
region: Region
Expand All @@ -220,7 +228,7 @@ case class CullInstancesWithTerminationTag(
): Unit = {
EC2.withEc2Client(keyRing, region, resources) { ec2Client =>
ELB.withClient(keyRing, region, resources) { elbClient =>
val allInstances = asg.instances.asScala
val allInstances = asg.instances.asScala.toSeq
resources.reporter.verbose(
s"Found the following instances: ${allInstances.map(_.instanceId).mkString(", ")}"
)
Expand All @@ -234,48 +242,23 @@ case class CullInstancesWithTerminationTag(
)
}

// See https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html#lifecycle-hooks-overview
val terminatingStates = List(
LifecycleState.TERMINATING,
LifecycleState.TERMINATING_WAIT,
LifecycleState.TERMINATING_PROCEED,
LifecycleState.TERMINATED
).map(_.toString)

val isAlreadyTerminating =
terminatingStates.contains(instance.lifecycleStateAsString)
TerminatingStates.contains(instance.lifecycleState)
val isTaggedForTermination =
EC2.hasTag(instance, "Magenta", "Terminate", ec2Client)

isTaggedForTermination && !isAlreadyTerminating
})

val instancesToRetain = allInstances.diff(instancesToKill).toList
resources.reporter.verbose(
s"Decided to keep the following instances: ${instancesToRetain.map(_.instanceId).mkString(", ")}"
reportOnRetainedInstances(
ec2Client,
resources,
allInstances,
instancesToKill
)

if (instancesToRetain.size != instancesToKill.size) {
resources.reporter.warning(
s"Terminating ${instancesToKill.size} instances and retaining ${instancesToRetain.size} instances"
)
logger.warn(
s"Unusual number of instances terminated as part of autoscaling deployment"
)
instancesToRetain.foreach(instanceToRetain => {
val tags = EC2
.allTags(instanceToRetain, ec2Client)
.toList
.map(tag => s"${tag.key}:${tag.value}")
.mkString(", ")
resources.reporter.verbose(
s"Will not terminate $instanceToRetain. State: ${instanceToRetain.lifecycleStateAsString}. Tags: $tags"
)
})
}

val orderedInstancesToKill =
instancesToKill.toSeq.transposeBy(_.availabilityZone)
instancesToKill.transposeBy(_.availabilityZone)
try {
resources.reporter.verbose(
s"Culling instances: ${orderedInstancesToKill.map(_.instanceId).mkString(", ")}"
Expand All @@ -302,6 +285,37 @@ case class CullInstancesWithTerminationTag(
.contains("ValidationError")
}

private def reportOnRetainedInstances(
ec2Client: Ec2Client,
resources: DeploymentResources,
allInstances: Seq[Instance],
instancesToKill: Seq[Instance]
): Unit = {
val instancesToRetain = allInstances.diff(instancesToKill).toList
resources.reporter.verbose(
s"Decided to keep the following instances: ${instancesToRetain.map(_.instanceId).mkString(", ")}"
)

if (instancesToRetain.size != instancesToKill.size) {
resources.reporter.warning(
s"Terminating ${instancesToKill.size} instances and retaining ${instancesToRetain.size} instances"
)
logger.warn(
s"Unusual number of instances terminated as part of autoscaling deployment"
)
instancesToRetain.foreach(instanceToRetain => {
val tags = EC2
.allTags(instanceToRetain, ec2Client)
.toList
.map(tag => s"${tag.key}:${tag.value}")
.mkString(", ")
resources.reporter.verbose(
s"Will not terminate $instanceToRetain. State: ${instanceToRetain.lifecycleStateAsString}. Tags: $tags"
)
})
}
}

lazy val description =
s"Terminate instances in $asgName with the termination tag for this deploy"
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
package magenta.tasks

import magenta.artifact.S3Path
import magenta.tasks.CloudFormationParameters.{
ExistingParameter,
InputParameter,
TemplateParameter
}
import magenta.tasks.CloudFormationParameters.{ExistingParameter, InputParameter, TemplateParameter}
import magenta.tasks.UpdateCloudFormationTask._
import magenta.{DeployReporter, DeploymentResources, KeyRing, Region}
import software.amazon.awssdk.services.cloudformation.CloudFormationClient
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ class AutoScalingTest
WaitForStabilization(testInfo, ofMinutes(15), Region("eu-west-1")),
CullInstancesWithTerminationTag(testInfo, Region("eu-west-1")),
TerminationGrace(testInfo, Region("eu-west-1"), ofSeconds(10)),
WaitForStabilization(testInfo, ofMinutes(15), Region("eu-west-1")),
ResumeAlarmNotifications(testInfo, Region("eu-west-1"))
ResumeAlarmNotifications(testInfo, Region("eu-west-1")),
WaitForStabilization(testInfo, ofMinutes(15), Region("eu-west-1"))
)
actual shouldBe expected
}
Expand Down Expand Up @@ -222,12 +222,12 @@ class AutoScalingTest
),
CullInstancesWithTerminationTag(testOldCfnAsg, Region("eu-west-1")),
TerminationGrace(testOldCfnAsg, Region("eu-west-1"), ofSeconds(10)),
ResumeAlarmNotifications(testOldCfnAsg, Region("eu-west-1")),
WaitForStabilization(
testOldCfnAsg,
ofMinutes(15),
Region("eu-west-1")
),
ResumeAlarmNotifications(testOldCfnAsg, Region("eu-west-1")),
// All tasks for testNewCdkAsg
WaitForStabilization(testNewCdkAsg, ofMinutes(5), Region("eu-west-1")),
CheckGroupSize(testNewCdkAsg, Region("eu-west-1")),
Expand All @@ -252,12 +252,12 @@ class AutoScalingTest
),
CullInstancesWithTerminationTag(testNewCdkAsg, Region("eu-west-1")),
TerminationGrace(testNewCdkAsg, Region("eu-west-1"), ofSeconds(10)),
ResumeAlarmNotifications(testNewCdkAsg, Region("eu-west-1")),
WaitForStabilization(
testNewCdkAsg,
ofMinutes(15),
Region("eu-west-1")
),
ResumeAlarmNotifications(testNewCdkAsg, Region("eu-west-1"))
)
)
actual shouldBe expected
}
Expand Down Expand Up @@ -347,8 +347,8 @@ class AutoScalingTest
WaitForStabilization(testInfo, ofMinutes(3), Region("eu-west-1")),
CullInstancesWithTerminationTag(testInfo, Region("eu-west-1")),
TerminationGrace(testInfo, Region("eu-west-1"), ofSeconds(11)),
WaitForStabilization(testInfo, ofMinutes(3), Region("eu-west-1")),
ResumeAlarmNotifications(testInfo, Region("eu-west-1"))
ResumeAlarmNotifications(testInfo, Region("eu-west-1")),
WaitForStabilization(testInfo, ofMinutes(3), Region("eu-west-1"))
)
actual shouldBe expected
}
Expand Down

0 comments on commit 753e0e2

Please sign in to comment.