From 6d508d6681ac73829d4490f235a49bb43808165a Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Wed, 28 Jul 2021 18:01:31 +0200 Subject: [PATCH 1/8] Add detector for fluentd logs with prometheus --- modules/smart-agent_fluentd/README.md | 111 ++++++++++++++++++ modules/smart-agent_fluentd/common-filters.tf | 1 + modules/smart-agent_fluentd/common-locals.tf | 1 + modules/smart-agent_fluentd/common-modules.tf | 1 + .../smart-agent_fluentd/common-variables.tf | 1 + .../smart-agent_fluentd/common-versions.tf | 1 + modules/smart-agent_fluentd/conf/readme.yaml | 42 +++++++ modules/smart-agent_fluentd/detectors-gen.tf | 52 ++++++++ modules/smart-agent_fluentd/outputs.tf | 9 ++ modules/smart-agent_fluentd/tags.tf | 4 + modules/smart-agent_fluentd/variables-gen.tf | 81 +++++++++++++ 11 files changed, 304 insertions(+) create mode 100644 modules/smart-agent_fluentd/README.md create mode 120000 modules/smart-agent_fluentd/common-filters.tf create mode 120000 modules/smart-agent_fluentd/common-locals.tf create mode 120000 modules/smart-agent_fluentd/common-modules.tf create mode 120000 modules/smart-agent_fluentd/common-variables.tf create mode 120000 modules/smart-agent_fluentd/common-versions.tf create mode 100644 modules/smart-agent_fluentd/conf/readme.yaml create mode 100644 modules/smart-agent_fluentd/detectors-gen.tf create mode 100644 modules/smart-agent_fluentd/outputs.tf create mode 100644 modules/smart-agent_fluentd/tags.tf create mode 100644 modules/smart-agent_fluentd/variables-gen.tf diff --git a/modules/smart-agent_fluentd/README.md b/modules/smart-agent_fluentd/README.md new file mode 100644 index 000000000..6d540dd7c --- /dev/null +++ b/modules/smart-agent_fluentd/README.md @@ -0,0 +1,111 @@ +# Fluentd SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required 
metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/docs/modules/usage.html) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-smart-agent-fluentd" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/smart-agent_fluentd?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/docs/modules/sources.html)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that all modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to apply + [filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention). + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/docs/configuration/types.html#object-) where each key represents an + available [detector rule severity](https://docs.signalfx.com/en/latest/detect-alert/set-up-detectors.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/docs/configuration/variables.html) make it possible to +customize the detectors' behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Fluentd heartbeat|-|X|-|-|-| +|Fluentd Buffer length|-|X|-|-|-| + +## How to collect required metrics? + +This module uses metrics available from +[monitors](https://docs.signalfx.com/en/latest/integrations/agent/monitors/_monitor-config.html) +available in the [SignalFx Smart +Agent](https://github.com/signalfx/signalfx-agent). Check the "Related documentation" section for more +information including the official documentation of this monitor. + + + + +### Metrics + + +To filter only required metrics for the detectors of this module, add the +[datapointsToExclude](https://docs.signalfx.com/en/latest/integrations/agent/filtering.html) parameter to +the corresponding monitor configuration: + +```yaml + - metricNames: + - '*' + - '!fluentd_output_status_buffer_stage_length' + +``` + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) diff --git a/modules/smart-agent_fluentd/common-filters.tf b/modules/smart-agent_fluentd/common-filters.tf new file mode 120000 index 000000000..4df54e41e --- /dev/null +++ b/modules/smart-agent_fluentd/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-smart-agent.tf \ No newline at end of file diff --git a/modules/smart-agent_fluentd/common-locals.tf b/modules/smart-agent_fluentd/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/smart-agent_fluentd/common-locals.tf @@ 
-0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/smart-agent_fluentd/common-modules.tf b/modules/smart-agent_fluentd/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/smart-agent_fluentd/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/smart-agent_fluentd/common-variables.tf b/modules/smart-agent_fluentd/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/smart-agent_fluentd/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/smart-agent_fluentd/common-versions.tf b/modules/smart-agent_fluentd/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/smart-agent_fluentd/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/smart-agent_fluentd/conf/readme.yaml b/modules/smart-agent_fluentd/conf/readme.yaml new file mode 100644 index 000000000..dbc7a47bc --- /dev/null +++ b/modules/smart-agent_fluentd/conf/readme.yaml @@ -0,0 +1,42 @@ +documentations: + - name: Smart Agent monitor + url: 'https://docs.signalfx.com/en/latest/integrations/agent/monitors/prometheus-exporter.html' + +source_doc: | + ### Agent + + Here is the official [main + documentation](https://docs.signalfx.com/en/latest/integrations/integrations-reference/integrations.kubernetes.html) for + kubernetes including the `signalfx-agent` installation which must be installed as + [daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) on your cluster. 
+ + ### Monitors + + The detectors in this module are based on metrics reported by the following monitors: + + * [prometheus-exporter](https://docs.signalfx.com/en/latest/integrations/agent/monitors/prometheus-exporter.html) + + Detectors in this module will at least require this metric: + + * `fluentd_output_status_buffer_stage_length` + + This metric is enabled by default and it is the only one used by the detectors. + + ### Fluentd Prometheus + + To use this detector, the instance must expose a Prometheus endpoint publishing the Fluentd metrics. + You have to install fluent-plugin-prometheus on the server and add the monitoring configuration to td-agent to use it. + More information is available in the [Fluentd](https://docs.fluentd.org/monitoring-fluentd/monitoring-prometheus) documentation. + +notes: | + Here is an example of the monitor configuration: + ```yaml + - type: prometheus-exporter + host: 127.0.0.1 + port: 24231 + - metricNames: + - '*' + - '!fluentd_output_status_buffer_stage_length' + ``` + + You can find all the metrics on the repository [fluent-plugin-prometheus](https://github.com/fluent/fluent-plugin-prometheus/blob/master/README.md) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf new file mode 100644 index 000000000..2edd678c2 --- /dev/null +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -0,0 +1,52 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Fluentd heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + max_delay = 900 + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and 
${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('MAJOR') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.heartbeat_notifications, "major", []), var.notifications.major) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } +} + +resource "signalfx_detector" "buffer" { + name = format("%s %s", local.detector_name_prefix, "Fluentd Buffer length") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') + detect(when(signal < ${var.buffer_threshold})).publish('MAJOR') +EOF + + rule { + description = "is too low < ${var.buffer_threshold}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.buffer_disabled, var.buffer_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.buffer_notifications, "major", []), var.notifications.major) + runbook_url = try(coalesce(var.buffer_runbook_url, var.runbook_url), "") + tip = var.buffer_tip + parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } +} diff --git a/modules/smart-agent_fluentd/outputs.tf b/modules/smart-agent_fluentd/outputs.tf new file mode 100644 index 000000000..6be2d8ba3 --- /dev/null +++ b/modules/smart-agent_fluentd/outputs.tf @@ -0,0 +1,9 @@ +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + +output "buffer" { + description = "Detector resource for buffer length" + value = signalfx_detector.buffer +} diff --git a/modules/smart-agent_fluentd/tags.tf b/modules/smart-agent_fluentd/tags.tf new file mode 100644 index 000000000..af7861d22 --- /dev/null +++ b/modules/smart-agent_fluentd/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["smart-agent", "fluentd"] +} + diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf new file mode 100644 index 000000000..9e845874c --- /dev/null +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -0,0 +1,81 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = ".sum(by=['host'])" +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"10m\")" + type = string + default = "20m" +} + +# buffer detector + +variable "buffer_notifications" { + description = "Notification recipients list per severity overridden for buffer detector" + type = map(list(string)) + default = {} +} + +variable "buffer_aggregation_function" { + description = "Aggregation function and group by for buffer detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".sum(by=['host'])" +} + +variable "buffer_transformation_function" { + description = "Transformation function for buffer detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".max(over='5min')" +} + +variable "buffer_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "buffer_disabled" { + description = "Disable all alerting rules for buffer detector" + type = bool + default = null +} + +variable "buffer_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "buffer_threshold" { + description = "Major threshold for up detector" + type = number + default = 1 +} From 1c27607da87526efd59d5278ef9b71aa29f04d2a Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Fri, 3 Sep 2021 15:53:59 +0200 Subject: [PATCH 2/8] smart-agent_fluentd - Add autoresolve on buffer detector where there is no data --- modules/smart-agent_fluentd/detectors-gen.tf | 2 +- modules/smart-agent_fluentd/variables-gen.tf | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 2edd678c2..154653d6b 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -35,7 +35,7 @@ resource "signalfx_detector" "buffer" { program_text = <<-EOF signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal < ${var.buffer_threshold})).publish('MAJOR') + detect(when(signal < ${var.buffer_threshold}) and not when(signal is None, '${var.buffer_auto_resolve_after}')).publish('MAJOR') EOF rule { diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 9e845874c..7c1adce46 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ 
b/modules/smart-agent_fluentd/variables-gen.tf @@ -51,9 +51,9 @@ variable "buffer_aggregation_function" { } variable "buffer_transformation_function" { - description = "Transformation function for buffer detector (i.e. \".mean(over='5m')\")" + description = "Transformation function for buffer detector (i.e. \".mean(over='10m')\")" type = string - default = ".max(over='5min')" + default = ".max(over='10min')" } variable "buffer_tip" { @@ -79,3 +79,9 @@ variable "buffer_threshold" { type = number default = 1 } + +variable "buffer_auto_resolve_after" { + description = "Auto resolve the alert if there are no DATA after some time (i.e. \"5m\")" + type = string + default = "5m" +} From 8519811f3118d8ab38dc15c3e70f7dd4483be7d1 Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Mon, 20 Sep 2021 13:27:25 +0200 Subject: [PATCH 3/8] smart-agent_fluentd - replace autoresolve by clear_duration --- modules/smart-agent_fluentd/detectors-gen.tf | 3 +-- modules/smart-agent_fluentd/variables-gen.tf | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 154653d6b..5c425d0ba 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -35,9 +35,8 @@ resource "signalfx_detector" "buffer" { program_text = <<-EOF signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal < ${var.buffer_threshold}) and not when(signal is None, '${var.buffer_auto_resolve_after}')).publish('MAJOR') + detect(when(signal < ${var.buffer_threshold}), off=when(signal is None, '${var.buffer_auto_clear_duration}')).publish('MAJOR') EOF - rule { description = "is too low < ${var.buffer_threshold}" severity = "Major" diff --git 
a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 7c1adce46..5b4a50594 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -80,8 +80,8 @@ variable "buffer_threshold" { default = 1 } -variable "buffer_auto_resolve_after" { - description = "Auto resolve the alert if there are no DATA after some time (i.e. \"5m\")" +variable "buffer_auto_clear_duration" { + description = "Duration for the buffer_auto clear condition" type = string default = "5m" } From 5e2bbbbd3b59ae4694cb9e736b0d1126c136fd5b Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Thu, 14 Oct 2021 15:31:25 +0200 Subject: [PATCH 4/8] smart-agent_fluentd - Remove off function for fixing the autoresolve on no data --- modules/smart-agent_fluentd/detectors-gen.tf | 2 +- modules/smart-agent_fluentd/variables-gen.tf | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 5c425d0ba..042db937d 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -35,7 +35,7 @@ resource "signalfx_detector" "buffer" { program_text = <<-EOF signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal < ${var.buffer_threshold}), off=when(signal is None, '${var.buffer_auto_clear_duration}')).publish('MAJOR') + detect(when(signal < ${var.buffer_threshold})).publish('MAJOR') EOF rule { description = "is too low < ${var.buffer_threshold}" diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 5b4a50594..353ce2bd1 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf 
@@ -80,8 +80,3 @@ variable "buffer_threshold" { default = 1 } -variable "buffer_auto_clear_duration" { - description = "Duration for the buffer_auto clear condition" - type = string - default = "5m" -} From 64292d796a3f9dc17f599f1486511eb5f7a0598a Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Thu, 14 Oct 2021 16:01:12 +0200 Subject: [PATCH 5/8] smart-agent_fluentd - Add lasting to buffer alert --- modules/smart-agent_fluentd/detectors-gen.tf | 2 +- modules/smart-agent_fluentd/variables-gen.tf | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 042db937d..4a71c6839 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -35,7 +35,7 @@ resource "signalfx_detector" "buffer" { program_text = <<-EOF signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal < ${var.buffer_threshold})).publish('MAJOR') + detect(when(signal < ${var.buffer_threshold}, lasting="${var.buffer_lasting_seconds}s")).publish('MAJOR') EOF rule { description = "is too low < ${var.buffer_threshold}" diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 353ce2bd1..891a946a9 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -80,3 +80,8 @@ variable "buffer_threshold" { default = 1 } +variable "buffer_lasting_seconds" { + description = "Minimum duration that conditions must be true before raising alert (in seconds)" + type = number + default = 300 +} From 30cb74614e2e8ecb9f53a1c9bc71e3035f425934 Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Fri, 29 Oct 2021 13:25:27 +0200 Subject: [PATCH 6/8] smart-agent_fluentd 
- Replace metric fluentd_output_status_buffer_stage_length by fluentd_output_status_buffer_queue_length on the fluend detector --- modules/smart-agent_fluentd/detectors-gen.tf | 8 ++++---- modules/smart-agent_fluentd/variables-gen.tf | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 4a71c6839..674b46c4f 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -9,7 +9,7 @@ resource "signalfx_detector" "heartbeat" { program_text = <<-EOF from signalfx.detectors.not_reporting import not_reporting - signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + signal = data('fluentd_output_status_buffer_queue_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('MAJOR') EOF @@ -34,11 +34,11 @@ resource "signalfx_detector" "buffer" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('fluentd_output_status_buffer_stage_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal < ${var.buffer_threshold}, lasting="${var.buffer_lasting_seconds}s")).publish('MAJOR') + signal = data('fluentd_output_status_buffer_queue_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') + detect(when(signal > ${var.buffer_threshold}, 
lasting="${var.buffer_lasting_seconds}s")).publish('MAJOR') EOF rule { - description = "is too low < ${var.buffer_threshold}" + description = "is too high > ${var.buffer_threshold}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.buffer_disabled, var.buffer_disabled, var.detectors_disabled) diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 891a946a9..5b30bee80 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -77,7 +77,7 @@ variable "buffer_runbook_url" { variable "buffer_threshold" { description = "Major threshold for up detector" type = number - default = 1 + default = 10 } variable "buffer_lasting_seconds" { From f9faff84d4de933989ef0967dc5f05c829ea72b3 Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Wed, 10 Nov 2021 16:27:22 +0100 Subject: [PATCH 7/8] smart-agent_fluentd - replace max by min on the default buffer_transformation_function --- modules/smart-agent_fluentd/variables-gen.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 5b30bee80..3b7877b80 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -53,7 +53,7 @@ variable "buffer_aggregation_function" { variable "buffer_transformation_function" { description = "Transformation function for buffer detector (i.e. 
\".mean(over='10m')\")" type = string - default = ".max(over='10min')" + default = ".min(over='10min')" } variable "buffer_tip" { From c58881034c153d996103d1c4fbea0c6e859cc6f7 Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Tue, 30 Jan 2024 16:09:23 +0100 Subject: [PATCH 8/8] smart-agent_fluentd - add auto_resolve_after parameter --- modules/smart-agent_fluentd/detectors-gen.tf | 2 +- modules/smart-agent_fluentd/variables-gen.tf | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/smart-agent_fluentd/detectors-gen.tf b/modules/smart-agent_fluentd/detectors-gen.tf index 674b46c4f..daaec399e 100644 --- a/modules/smart-agent_fluentd/detectors-gen.tf +++ b/modules/smart-agent_fluentd/detectors-gen.tf @@ -35,7 +35,7 @@ resource "signalfx_detector" "buffer" { program_text = <<-EOF signal = data('fluentd_output_status_buffer_queue_length', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.buffer_aggregation_function}${var.buffer_transformation_function}.publish('signal') - detect(when(signal > ${var.buffer_threshold}, lasting="${var.buffer_lasting_seconds}s")).publish('MAJOR') + detect(when(signal > ${var.buffer_threshold}, lasting="${var.buffer_lasting_seconds}s"), auto_resolve_after="${var.buffer_auto_resolve_seconds}s").publish('MAJOR') EOF rule { description = "is too high > ${var.buffer_threshold}" diff --git a/modules/smart-agent_fluentd/variables-gen.tf b/modules/smart-agent_fluentd/variables-gen.tf index 3b7877b80..54d6f3410 100644 --- a/modules/smart-agent_fluentd/variables-gen.tf +++ b/modules/smart-agent_fluentd/variables-gen.tf @@ -85,3 +85,10 @@ variable "buffer_lasting_seconds" { type = number default = 300 } + +variable "buffer_auto_resolve_seconds" { + description = "Alert duration after NO DATA (in seconds)" + type = number + default = 60 +} +