From 55e89e542aa479398cbdc62ce67c87d8808432a9 Mon Sep 17 00:00:00 2001
From: "Siyu Jiang (See-You John)" <91580504+jsy1218@users.noreply.github.com>
Date: Thu, 7 Nov 2024 16:30:30 -0800
Subject: [PATCH] fix: tenderly sim system down as sev2 (#909)

---
 bin/stacks/routing-api-stack.ts | 57 ++++++++++++++++++++++++++-------
 lib/handlers/quote/quote.ts     |  3 ++
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/bin/stacks/routing-api-stack.ts b/bin/stacks/routing-api-stack.ts
index 6ed020739e..30459e59dc 100644
--- a/bin/stacks/routing-api-stack.ts
+++ b/bin/stacks/routing-api-stack.ts
@@ -327,15 +327,12 @@ export class RoutingAPIStack extends cdk.Stack {
       evaluationPeriods: 3,
     })
 
-    // Simulations can fail for valid reasons. For example, if the simulation reverts due
-    // to slippage checks (can happen with FOT tokens sometimes since our quoter does not
-    // account for the fees taken during transfer when we show the user the quote).
-    //
-    // For this reason we only alert on SEV3 to avoid unnecessary pages.
-    const simulationAlarmSev3 = new aws_cloudwatch.Alarm(this, 'RoutingAPI-SEV3-Simulation', {
-      alarmName: 'RoutingAPI-SEV3-Simulation',
+    // Tenderly simulation system downtime is SEV2 because it blocks swaps coming from trading-api.
+    // When this alarm fires, we are confident that Tenderly itself is down.
+    const simulationAlarmSev2 = new aws_cloudwatch.Alarm(this, 'RoutingAPI-SEV2-Simulation', {
+      alarmName: 'RoutingAPI-SEV2-Simulation',
       metric: new MathExpression({
-        expression: '100*(simulationFailed/simulationRequested)',
+        expression: '100*(simulationSystemDown/simulationRequested)',
         period: Duration.minutes(30),
         usingMetrics: {
           simulationRequested: new aws_cloudwatch.Metric({
@@ -345,9 +342,9 @@ export class RoutingAPIStack extends cdk.Stack {
             unit: aws_cloudwatch.Unit.COUNT,
             statistic: 'sum',
           }),
-          simulationFailed: new aws_cloudwatch.Metric({
+          simulationSystemDown: new aws_cloudwatch.Metric({
             namespace: 'Uniswap',
-            metricName: `SimulationFailed`,
+            metricName: `SimulationSystemDown`,
             dimensionsMap: { Service: 'RoutingAPI' },
             unit: aws_cloudwatch.Unit.COUNT,
             statistic: 'sum',
@@ -358,6 +355,41 @@ export class RoutingAPIStack extends cdk.Stack {
       evaluationPeriods: 3,
       treatMissingData: aws_cloudwatch.TreatMissingData.NOT_BREACHING, // Missing data points are treated as "good" and within the threshold
     })
+    const simulationAlarmByChainSev2: cdk.aws_cloudwatch.Alarm[] = []
+    SUPPORTED_CHAINS.forEach((chainId) => {
+      if (CHAINS_NOT_MONITORED.includes(chainId)) {
+        return
+      }
+
+      const simulationAlarmSev2 = new aws_cloudwatch.Alarm(this, `RoutingAPI-SEV2-SimulationChainId${chainId}`, {
+        alarmName: `RoutingAPI-SEV2-SimulationChainId${chainId}`,
+        metric: new MathExpression({
+          expression: `100*(simulationSystemDown/simulationRequested)`,
+          period: Duration.minutes(30),
+          usingMetrics: {
+            simulationRequested: new aws_cloudwatch.Metric({
+              namespace: 'Uniswap',
+              metricName: `Simulation Requested`,
+              dimensionsMap: { Service: 'RoutingAPI' },
+              unit: aws_cloudwatch.Unit.COUNT,
+              statistic: 'sum',
+            }),
+            simulationSystemDown: new aws_cloudwatch.Metric({
+              namespace: 'Uniswap',
+              metricName: `SimulationSystemDownChainId${chainId}`,
+              dimensionsMap: { Service: 'RoutingAPI' },
+              unit: aws_cloudwatch.Unit.COUNT,
+              statistic: 'sum',
+            }),
+          },
+        }),
+        threshold: 20,
+        evaluationPeriods: 3,
+        treatMissingData: aws_cloudwatch.TreatMissingData.NOT_BREACHING, // Missing data points are treated as "good" and within the threshold
+      })
+
+      simulationAlarmByChainSev2.push(simulationAlarmSev2)
+    })
 
     // Create an alarm for when GraphQLTokenFeeFetcherFetchFeesFailure rate goes above 15%.
     // We do have on chain fallback in place of GQL failure, but we want to be alerted if the failure rate is high to take action.
@@ -580,7 +612,7 @@ export class RoutingAPIStack extends cdk.Stack {
       apiAlarm5xxSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       apiAlarm4xxSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       apiAlarmLatencySev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
-      simulationAlarmSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
+      simulationAlarmSev2.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       graphqlTokenFeeFetcherErrorRateSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
 
       percent4XXByChainAlarm.forEach((alarm) => {
@@ -595,6 +627,9 @@ export class RoutingAPIStack extends cdk.Stack {
       successRateByRequestSourceAndChainIdAlarm.forEach((alarm) => {
         alarm.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       })
+      simulationAlarmByChainSev2.forEach((alarm) => {
+        alarm.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
+      })
     }
 
     this.url = new CfnOutput(this, 'Url', {
diff --git a/lib/handlers/quote/quote.ts b/lib/handlers/quote/quote.ts
index 406648364e..d489c20cde 100644
--- a/lib/handlers/quote/quote.ts
+++ b/lib/handlers/quote/quote.ts
@@ -512,6 +512,9 @@ export class QuoteHandler extends APIGLambdaHandler<
       metric.putMetric('SimulationNotApproved', 1, MetricLoggerUnit.Count)
     } else if (simulationStatus == SimulationStatus.NotSupported) {
       metric.putMetric('SimulationNotSupported', 1, MetricLoggerUnit.Count)
+    } else if (simulationStatus == SimulationStatus.SystemDown) {
+      metric.putMetric('SimulationSystemDown', 1, MetricLoggerUnit.Count)
+      metric.putMetric(`SimulationSystemDownChainId${chainId}`, 1, MetricLoggerUnit.Count)
     }
 
     const routeResponse: Array<SupportedPoolInRoute[]> = []