From de252bb809610479fdf088a33e11a1752fbbbd89 Mon Sep 17 00:00:00 2001 From: Santiago Date: Fri, 24 Jan 2025 16:39:17 +0100 Subject: [PATCH 1/4] (PoC) Alertmanager: Strict initialization mode for the Alertmanager --- .../configure/configuration-parameters/index.md | 5 +++++ pkg/alertmanager/multitenant.go | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 34381239ddd..09d6557ef49 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2377,6 +2377,11 @@ sharding_ring: # CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix [grafana_alertmanager_conditionally_skip_tenant_suffix: | default = ""] +# (experimental) Skip starting the Alertmanager for tenants without a +# non-default, non-empty configuration. +# CLI flag: -alertmanager.strict-initialization-mode +[strict_initialization_mode: | default = false] + # (advanced) Maximum number of concurrent GET requests allowed per tenant. The # zero value (and negative values) result in a limit of GOMAXPROCS or 8, # whichever is larger. Status code 503 is served for GET requests that would diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index cfbc3572556..b3f66a1c416 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -85,6 +85,7 @@ type MultitenantAlertmanagerConfig struct { GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"` + StrictInitializationMode bool `yaml:"strict_initialization_mode" category:"experimental"` MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"` @@ -129,6 +130,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.") f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.") f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.") + f.BoolVar(&cfg.StrictInitializationMode, "alertmanager.strict-initialization-mode", false, "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.") f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.") f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.") @@ -677,7 +679,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s } if !startAM { - level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user) + level.Debug(am.logger).Log("msg", "not initializing alertmanager for tenant", "user", user) amInitSkipped[user] = struct{}{} continue } @@ -723,20 +725,27 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s // computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations. // It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant. func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) { + isGrafanaCfgUsable := cfgs.Grafana.Promoted && !cfgs.Grafana.Default + isMimirCfgUsable := cfgs.Mimir.RawConfig != "" && cfgs.Mimir.RawConfig != am.fallbackConfig + if am.cfg.StrictInitializationMode && !isGrafanaCfgUsable && !isMimirCfgUsable { + // Skip starting the Alertmanager if we have no usable configurations. + return amConfig{}, false, nil + } + cfg := amConfig{ AlertConfigDesc: cfgs.Mimir, tmplExternalURL: am.cfg.ExternalURL.URL, } // If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration. - if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" { + if !isGrafanaCfgUsable || cfgs.Grafana.RawConfig == "" { level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User) isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix) return cfg, !isGrafanaTenant, nil } // If the Mimir configuration is either default or empty, use the Grafana configuration. - if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" { + if !isMimirCfgUsable { level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User) cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig) return cfg, true, err From a0e52e5fb050b0c14bf35263677fb191c376e44e Mon Sep 17 00:00:00 2001 From: Santiago Date: Fri, 24 Jan 2025 17:04:10 +0100 Subject: [PATCH 2/4] make reference-help --- cmd/mimir/config-descriptor.json | 11 +++++++++++ cmd/mimir/help-all.txt.tmpl | 2 ++ 2 files changed, 13 insertions(+) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index f0b6c93d304..dc287435096 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -15597,6 +15597,17 @@ "fieldType": "string", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "strict_initialization_mode", + "required": false, + "desc": "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "alertmanager.strict-initialization-mode", + "fieldType": "boolean", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_concurrent_get_requests_per_tenant", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index fdea624ec68..232ec2b8109 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -355,6 +355,8 @@ Usage of ./cmd/mimir/mimir: Directory to store Alertmanager state and temporarily configuration files. The content of this directory is not required to be persisted between restarts unless Alertmanager replication has been disabled. (default "./data-alertmanager/") -alertmanager.storage.retention duration How long should we store stateful data (notification logs and silences). For notification log entries, refers to how long should we keep entries before they expire and are deleted. For silences, refers to how long should tenants view silences after they expire and are deleted. (default 120h0m0s) + -alertmanager.strict-initialization-mode + [experimental] Skip starting the Alertmanager for tenants without a non-default, non-empty configuration. -alertmanager.utf8-migration-logging-enabled [experimental] Enable logging of tenant configurations that are incompatible with UTF-8 strict mode. -alertmanager.utf8-strict-mode-enabled From 91a8ea07c50f70b2eb29cef6981be3c0d1f9576e Mon Sep 17 00:00:00 2001 From: Santiago Date: Fri, 24 Jan 2025 20:28:21 +0100 Subject: [PATCH 3/4] whitelist tenants receiving alerts --- pkg/alertmanager/multitenant.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index b3f66a1c416..101346b0a4b 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -326,6 +326,9 @@ type MultitenantAlertmanager struct { tenantsDiscovered prometheus.Gauge syncTotal *prometheus.CounterVec syncFailures *prometheus.CounterVec + + lolMtx sync.RWMutex + receivingAlerts map[string]struct{} } // NewMultitenantAlertmanager creates a new MultitenantAlertmanager. @@ -399,6 +402,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC registry: registerer, limits: limits, features: features, + receivingAlerts: map[string]struct{}{}, ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ Name: "cortex_alertmanager_ring_check_errors_total", Help: "Number of errors that have occurred when checking the ring for ownership.", @@ -729,7 +733,13 @@ func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) isMimirCfgUsable := cfgs.Mimir.RawConfig != "" && cfgs.Mimir.RawConfig != am.fallbackConfig if am.cfg.StrictInitializationMode && !isGrafanaCfgUsable && !isMimirCfgUsable { // Skip starting the Alertmanager if we have no usable configurations. - return amConfig{}, false, nil + am.lolMtx.RLock() + _, ok := am.receivingAlerts[cfgs.Mimir.User] + am.lolMtx.RUnlock() + if !ok { + return amConfig{}, false, nil + } + fmt.Printf("%s is receiving alerts!", cfgs.Mimir.User) } cfg := amConfig{ @@ -1014,6 +1024,12 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http return } + if req.URL.Path == "/alertmanager/api/v2/alerts" && req.Method == http.MethodPost { + am.lolMtx.Lock() + am.receivingAlerts[userID] = struct{}{} + am.lolMtx.Unlock() + } + if am.fallbackConfig != "" { userAM, err = am.alertmanagerFromFallbackConfig(req.Context(), userID) if errors.Is(err, errNotUploadingFallback) { From fa2f2d91f179eb6ecb04bad874bcd86c1f7a0765 Mon Sep 17 00:00:00 2001 From: Santiago Date: Mon, 27 Jan 2025 15:11:33 +0100 Subject: [PATCH 4/4] initialize Alertmanagers on-demand --- pkg/alertmanager/multitenant.go | 59 ++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 101346b0a4b..65b6314330d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -739,7 +739,7 @@ func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) if !ok { return amConfig{}, false, nil } - fmt.Printf("%s is receiving alerts!", cfgs.Mimir.User) + level.Debug(am.logger).Log("msg", "user has no usable config but is receiving alerts, starting Alertmanager", "user", cfgs.Mimir.User) } cfg := amConfig{ @@ -1028,6 +1028,16 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http am.lolMtx.Lock() am.receivingAlerts[userID] = struct{}{} am.lolMtx.Unlock() + userAM, err = am.startAlertmanager(req.Context(), userID) + if err != nil { + level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err) + http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError) + return + } + + level.Debug(am.logger).Log("msg", "alerts received, Alertmanager initialized", "user", userID, "err", err) + userAM.mux.ServeHTTP(w, req) + return } if am.fallbackConfig != "" { @@ -1050,6 +1060,53 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed) } +func (am *MultitenantAlertmanager) startAlertmanager(ctx context.Context, userID string) (*Alertmanager, error) { + if !am.isUserOwned(userID) { + return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance") + } + + cfg, err := am.store.GetAlertConfig(ctx, userID) + if err != nil { + if !errors.Is(err, alertspb.ErrNotFound) { + return nil, errors.Wrap(err, "failed to check for existing configuration") + } + + level.Warn(am.logger).Log("msg", "no configuration exists for user; uploading fallback configuration", "user", userID) + + // Upload an empty config so that the Alertmanager is not de-activated in the next poll. + cfgDesc := alertspb.ToProto("", nil, userID) + err = am.store.SetAlertConfig(ctx, cfgDesc) + if err != nil { + return nil, err + } + + // Calling setConfig with an empty configuration will use the fallback config. + amConfig := amConfig{ + AlertConfigDesc: cfgDesc, + tmplExternalURL: am.cfg.ExternalURL.URL, + } + err = am.setConfig(amConfig) + if err != nil { + return nil, err + } + + am.alertmanagersMtx.Lock() + defer am.alertmanagersMtx.Unlock() + return am.alertmanagers[userID], nil + } + + amConfig := amConfig{ + AlertConfigDesc: cfg, + tmplExternalURL: am.cfg.ExternalURL.URL, + } + if err := am.setConfig(amConfig); err != nil { + return nil, err + } + am.alertmanagersMtx.Lock() + defer am.alertmanagersMtx.Unlock() + return am.alertmanagers[userID], nil +} + func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) { // Make sure we never create fallback instances for a user not owned by this instance. // This check is not strictly necessary as the configuration polling loop will deactivate