-
Notifications
You must be signed in to change notification settings - Fork 527
[AWS Content Packs] [OOTB Alerts] Add alerting templates #16750
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4e63f7a
a26bc1d
34e0b3f
83f3fef
d7273e6
e75240e
5decaef
561461a
a475f2a
c7e2439
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-cloudtrail-otel-high-error-rate", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS CloudTrail OTEL] High error rate", | ||
| "tags": ["AWS CloudTrail Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when the critical error count for any source IP address exceeds a threshold (e.g. > 5 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.cloudtrail.otel-default| WHERE aws.error.code IN (\"InvalidClientTokenId\",\"SignatureDoesNotMatch\",\"InvalidAccessKeyId\",\"ExpiredToken\",\"InvalidToken\",\"InvalidPassword\",\"Failed authentication\",\"UnrecognizedClientException\",\"AccessDenied\",\"AccessDeniedException\",\"UnauthorizedOperation\")| STATS error_count = COUNT(*) BY source.address| WHERE error_count > 5" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-cloudtrail-otel-high-resource-deletion", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS CloudTrail OTEL] High resource deletion", | ||
| "tags": ["AWS CloudTrail Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when successful delete requests from any source IP address reach a threshold (e.g. >= 5 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.cloudtrail.otel-default | WHERE aws.error.code IS NULL | WHERE rpc.method IN (\"TerminateInstances\",\"DeleteBucket\",\"DeleteDBInstance\",\"DeleteFunction\",\"DeleteVolume\",\"DeleteSnapshot\") | STATS deletion_count = COUNT(*) BY source.address | WHERE deletion_count >= 5" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-cloudtrail-otel-high-risk-actions-succeeded", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS CloudTrail OTEL] High-risk actions succeeded", | ||
| "tags": ["AWS CloudTrail Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when more than one high-risk action succeeded within the rule time window for a single principal/user\nFROM logs-aws.cloudtrail.otel-default | WHERE rpc.method IN (\"StopLogging\", \"DeleteTrail\", \"UpdateTrail\", \"AttachUserPolicy\", \"AttachRolePolicy\", \"PutUserPolicy\", \"PutRolePolicy\", \"CreateAccessKey\", \"CreateUser\", \"CreateLoginProfile\", \"DisableKey\", \"ScheduleKeyDeletion\", \"DeleteBucket\", \"PutBucketPolicy\", \"PutBucketLogging\", \"DeleteDetector\", \"DeleteMembers\", \"DisassociateFromMasterAccount\", \"DeleteFlowLogs\", \"DeleteAlarms\", \"DeleteConfigRule\", \"DeleteEventBusRule\") AND aws.error.code IS NULL | STATS action_count = COUNT(*), actions = VALUES(rpc.method), ips = VALUES(source.address) BY aws.principal.arn, user.name | WHERE action_count>1" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-cloudtrail-otel-multiple-failed-login-attempts", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS CloudTrail OTEL] Multiple failed login attempts", | ||
| "tags": ["AWS CloudTrail Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does 10m seem like a long time period for detecting failed login attempts? |
||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when failed console login attempts from any source IP address reach a threshold (e.g. >= 100 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.cloudtrail.otel-default | WHERE @timestamp > NOW()- 10m | WHERE rpc.method == \"ConsoleLogin\" | WHERE aws.error.code IS NOT NULL | STATS failed_count = COUNT(*), users_tried = VALUES(user.name) BY source.address | WHERE failed_count >= 100 | SORT failed_count DESC" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i had a few concerns about this rule and did a sanity check by asking chatgpt for some feedback. it has a lot of concerns about this rule. did we get an LLM to thoroughly review all the queries here? i don't know if the concerns are valid, but i just want to check we have considered feedback like this. please DM me for the detail i got from GPT, but the summary was:
|
||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| { | ||
| "id": "aws-elb-otel-application-errors", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS ELB OTEL] Application errors", | ||
| "tags": [ | ||
| "AWS Elb Logs OpenTelemetry Assets" | ||
| ], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when the error count for any cloud.resource_id reaches a threshold (e.g. >= 50 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.elbaccess.otel-default | WHERE aws.elb.status.code >= 400| STATS error_count = COUNT(*) BY cloud.resource_id | WHERE error_count >= 50" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-elb-otel-backend-errors", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS ELB OTEL] Backend errors", | ||
| "tags": ["AWS Elb Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when the backend service error count for any cloud.resource_id reaches a threshold (e.g. >= 50 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.elbaccess.otel-default | WHERE aws.elb.backend.status.code >= 500| STATS backend_error_count = COUNT(*) BY cloud.resource_id | WHERE backend_error_count >= 50" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-vpcflow-otel-high-data-transfer-rate", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS VPC OTEL] High data transfer rate", | ||
| "tags": ["AWS VPC Logs OpenTelemetry Assets"], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this isn't a good tag name we should have tags for 'aws', 'vpc' (and possibly 'otel'?)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (same for all other rules in this PR) |
||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i don't think we need to include
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (same for all other rules in this PR) |
||
| "esql": "// Alert triggers when the accepted bytes for any network interface exceed a threshold (e.g. > 50GB in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.vpcflow.otel-default| WHERE aws.vpc.flow.action == \"ACCEPT\"| STATS total_bytes = SUM(aws.vpc.flow.bytes) BY network.interface.name| WHERE total_bytes > 53687091200" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "id": "aws-vpcflow-otel-high-reject-actions", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "[AWS VPC OTEL] High reject actions", | ||
| "tags": ["AWS VPC Logs OpenTelemetry Assets"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "5m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 10, | ||
| "timeWindowUnit": "m", | ||
| "esqlQuery": { | ||
| "esql": "// Alert triggers when the rejected flow count for any network interface exceeds a threshold (e.g. > 100 in 10 minutes)\n// You can adjust the threshold value in the WHERE clause as needed.\nFROM logs-aws.vpcflow.otel-default| WHERE aws.vpc.flow.action == \"REJECT\"| STATS reject_count = COUNT(*) BY network.interface.name| WHERE reject_count > 100" | ||
| }, | ||
| "groupBy": "row", | ||
| "timeField": "@timestamp" | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should be 15m to match description