From 55e89e542aa479398cbdc62ce67c87d8808432a9 Mon Sep 17 00:00:00 2001
From: "Siyu Jiang (See-You John)" <91580504+jsy1218@users.noreply.github.com>
Date: Thu, 7 Nov 2024 16:30:30 -0800
Subject: [PATCH] fix: tenderly sim system down as sev2 (#909)

---
 bin/stacks/routing-api-stack.ts | 57 ++++++++++++++++++++++++++-------
 lib/handlers/quote/quote.ts     |  3 ++
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/bin/stacks/routing-api-stack.ts b/bin/stacks/routing-api-stack.ts
index 6ed020739e..30459e59dc 100644
--- a/bin/stacks/routing-api-stack.ts
+++ b/bin/stacks/routing-api-stack.ts
@@ -327,15 +327,12 @@ export class RoutingAPIStack extends cdk.Stack {
       evaluationPeriods: 3,
     })
 
-    // Simulations can fail for valid reasons. For example, if the simulation reverts due
-    // to slippage checks (can happen with FOT tokens sometimes since our quoter does not
-    // account for the fees taken during transfer when we show the user the quote).
-    //
-    // For this reason we only alert on SEV3 to avoid unnecessary pages.
-    const simulationAlarmSev3 = new aws_cloudwatch.Alarm(this, 'RoutingAPI-SEV3-Simulation', {
-      alarmName: 'RoutingAPI-SEV3-Simulation',
+    // Tenderly sim system downtime is sev2, because it's swap blocking from trading-api.
+    // We have confidence that tenderly is down
+    const simulationAlarmSev2 = new aws_cloudwatch.Alarm(this, 'RoutingAPI-SEV2-Simulation', {
+      alarmName: 'RoutingAPI-SEV2-Simulation',
       metric: new MathExpression({
-        expression: '100*(simulationFailed/simulationRequested)',
+        expression: '100*(simulationSystemDown/simulationRequested)',
         period: Duration.minutes(30),
         usingMetrics: {
           simulationRequested: new aws_cloudwatch.Metric({
@@ -345,9 +342,9 @@ export class RoutingAPIStack extends cdk.Stack {
             unit: aws_cloudwatch.Unit.COUNT,
             statistic: 'sum',
           }),
-          simulationFailed: new aws_cloudwatch.Metric({
+          simulationSystemDown: new aws_cloudwatch.Metric({
             namespace: 'Uniswap',
-            metricName: `SimulationFailed`,
+            metricName: `SimulationSystemDown`,
             dimensionsMap: { Service: 'RoutingAPI' },
             unit: aws_cloudwatch.Unit.COUNT,
             statistic: 'sum',
@@ -358,6 +355,41 @@ export class RoutingAPIStack extends cdk.Stack {
       evaluationPeriods: 3,
       treatMissingData: aws_cloudwatch.TreatMissingData.NOT_BREACHING, // Missing data points are treated as "good" and within the threshold
     })
+    const simulationAlarmByChainSev2: cdk.aws_cloudwatch.Alarm[] = []
+    SUPPORTED_CHAINS.forEach((chainId) => {
+      if (CHAINS_NOT_MONITORED.includes(chainId)) {
+        return
+      }
+
+      const simulationAlarmSev2 = new aws_cloudwatch.Alarm(this, `RoutingAPI-SEV2-SimulationChainId${chainId}`, {
+        alarmName: `RoutingAPI-SEV2-SimulationChainId${chainId}`,
+        metric: new MathExpression({
+          expression: `100*(simulationSystemDown/simulationRequested)`,
+          period: Duration.minutes(30),
+          usingMetrics: {
+            simulationRequested: new aws_cloudwatch.Metric({
+              namespace: 'Uniswap',
+              metricName: `Simulation Requested`,
+              dimensionsMap: { Service: 'RoutingAPI' },
+              unit: aws_cloudwatch.Unit.COUNT,
+              statistic: 'sum',
+            }),
+            simulationSystemDown: new aws_cloudwatch.Metric({
+              namespace: 'Uniswap',
+              metricName: `SimulationSystemDownChainId${chainId}`,
+              dimensionsMap: { Service: 'RoutingAPI' },
+              unit: aws_cloudwatch.Unit.COUNT,
+              statistic: 'sum',
+            }),
+          },
+        }),
+        threshold: 20,
+        evaluationPeriods: 3,
+        treatMissingData: aws_cloudwatch.TreatMissingData.NOT_BREACHING, // Missing data points are treated as "good" and within the threshold
+      })
+
+      simulationAlarmByChainSev2.push(simulationAlarmSev2)
+    })
 
     // Create an alarm for when GraphQLTokenFeeFetcherFetchFeesFailure rate goes above 15%.
     // We do have on chain fallback in place of GQL failure, but we want to be alerted if the failure rate is high to take action.
@@ -580,7 +612,7 @@ export class RoutingAPIStack extends cdk.Stack {
       apiAlarm5xxSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       apiAlarm4xxSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       apiAlarmLatencySev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
-      simulationAlarmSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
+      simulationAlarmSev2.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       graphqlTokenFeeFetcherErrorRateSev3.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
 
       percent4XXByChainAlarm.forEach((alarm) => {
@@ -595,6 +627,9 @@ export class RoutingAPIStack extends cdk.Stack {
       successRateByRequestSourceAndChainIdAlarm.forEach((alarm) => {
         alarm.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
       })
+      simulationAlarmByChainSev2.forEach((alarm) => {
+        alarm.addAlarmAction(new aws_cloudwatch_actions.SnsAction(chatBotTopic))
+      })
     }
 
     this.url = new CfnOutput(this, 'Url', {
diff --git a/lib/handlers/quote/quote.ts b/lib/handlers/quote/quote.ts
index 406648364e..d489c20cde 100644
--- a/lib/handlers/quote/quote.ts
+++ b/lib/handlers/quote/quote.ts
@@ -512,6 +512,9 @@ export class QuoteHandler extends APIGLambdaHandler<
       metric.putMetric('SimulationNotApproved', 1, MetricLoggerUnit.Count)
     } else if (simulationStatus == SimulationStatus.NotSupported) {
       metric.putMetric('SimulationNotSupported', 1, MetricLoggerUnit.Count)
+    } else if (simulationStatus == SimulationStatus.SystemDown) {
+      metric.putMetric('SimulationSystemDown', 1, MetricLoggerUnit.Count)
+      metric.putMetric(`SimulationSystemDownChainId${chainId}`, 1, MetricLoggerUnit.Count)
     }
 
     const routeResponse: Array = []