diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index f0b6c93d304..dc287435096 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -15597,6 +15597,17 @@ "fieldType": "string", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "strict_initialization_mode", + "required": false, + "desc": "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "alertmanager.strict-initialization-mode", + "fieldType": "boolean", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_concurrent_get_requests_per_tenant", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index fdea624ec68..232ec2b8109 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -355,6 +355,8 @@ Usage of ./cmd/mimir/mimir: Directory to store Alertmanager state and temporarily configuration files. The content of this directory is not required to be persisted between restarts unless Alertmanager replication has been disabled. (default "./data-alertmanager/") -alertmanager.storage.retention duration How long should we store stateful data (notification logs and silences). For notification log entries, refers to how long should we keep entries before they expire and are deleted. For silences, refers to how long should tenants view silences after they expire and are deleted. (default 120h0m0s) + -alertmanager.strict-initialization-mode + [experimental] Skip starting the Alertmanager for tenants without a non-default, non-empty configuration. -alertmanager.utf8-migration-logging-enabled [experimental] Enable logging of tenant configurations that are incompatible with UTF-8 strict mode. 
-alertmanager.utf8-strict-mode-enabled
diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
index 34381239ddd..09d6557ef49 100644
--- a/docs/sources/mimir/configure/configuration-parameters/index.md
+++ b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -2377,6 +2377,11 @@ sharding_ring:
 # CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
 [grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]
 
+# (experimental) Skip starting the Alertmanager for tenants without a
+# non-default, non-empty configuration.
+# CLI flag: -alertmanager.strict-initialization-mode
+[strict_initialization_mode: <boolean> | default = false]
+
 # (advanced) Maximum number of concurrent GET requests allowed per tenant. The
 # zero value (and negative values) result in a limit of GOMAXPROCS or 8,
 # whichever is larger. Status code 503 is served for GET requests that would
diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go
index cfbc3572556..65b6314330d 100644
--- a/pkg/alertmanager/multitenant.go
+++ b/pkg/alertmanager/multitenant.go
@@ -85,6 +85,7 @@ type MultitenantAlertmanagerConfig struct {
 
 	GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
 	GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
+	StrictInitializationMode bool `yaml:"strict_initialization_mode" category:"experimental"`
 
 	MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`
 
@@ -129,6 +130,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger
 	f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
 	f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
 	f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
+	f.BoolVar(&cfg.StrictInitializationMode, "alertmanager.strict-initialization-mode", false, "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.")
 	f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")
 
 	f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
@@ -324,6 +326,9 @@ type MultitenantAlertmanager struct {
 	tenantsDiscovered prometheus.Gauge
 	syncTotal *prometheus.CounterVec
 	syncFailures *prometheus.CounterVec
+
+	receivingAlertsMtx sync.RWMutex        // Guards receivingAlerts.
+	receivingAlerts    map[string]struct{} // Tenants that POSTed alerts; computeConfig starts them even in strict initialization mode.
 }
 
 // NewMultitenantAlertmanager creates a new MultitenantAlertmanager.
@@ -397,6 +402,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
 		registry: registerer,
 		limits: limits,
 		features: features,
+		receivingAlerts: map[string]struct{}{},
 		ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
 			Name: "cortex_alertmanager_ring_check_errors_total",
 			Help: "Number of errors that have occurred when checking the ring for ownership.",
@@ -677,7 +683,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
 		}
 
 		if !startAM {
-			level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user)
+			level.Debug(am.logger).Log("msg", "not initializing alertmanager for tenant", "user", user)
 			amInitSkipped[user] = struct{}{}
 			continue
 		}
@@ -723,20 +729,33 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
 // computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations.
 // It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant.
 func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) {
+	isGrafanaCfgUsable := cfgs.Grafana.Promoted && !cfgs.Grafana.Default && cfgs.Grafana.RawConfig != ""
+	isMimirCfgUsable := cfgs.Mimir.RawConfig != "" && cfgs.Mimir.RawConfig != am.fallbackConfig
+	if am.cfg.StrictInitializationMode && !isGrafanaCfgUsable && !isMimirCfgUsable {
+		// No usable configuration: skip starting the Alertmanager unless the tenant has POSTed alerts to this instance.
+		am.receivingAlertsMtx.RLock()
+		_, ok := am.receivingAlerts[cfgs.Mimir.User]
+		am.receivingAlertsMtx.RUnlock()
+		if !ok {
+			return amConfig{}, false, nil
+		}
+		level.Debug(am.logger).Log("msg", "user has no usable config but is receiving alerts, starting Alertmanager", "user", cfgs.Mimir.User)
+	}
+
 	cfg := amConfig{
 		AlertConfigDesc: cfgs.Mimir,
 		tmplExternalURL: am.cfg.ExternalURL.URL,
 	}
 
 	// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
-	if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
+	if !isGrafanaCfgUsable {
 		level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
 		isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
 		return cfg, !isGrafanaTenant, nil
 	}
 
 	// If the Mimir configuration is either default or empty, use the Grafana configuration.
-	if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
+	if !isMimirCfgUsable {
 		level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
 		cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
 		return cfg, true, err
@@ -1005,6 +1024,22 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
 		return
 	}
 
+	if req.URL.Path == "/alertmanager/api/v2/alerts" && req.Method == http.MethodPost {
+		am.receivingAlertsMtx.Lock()
+		am.receivingAlerts[userID] = struct{}{}
+		am.receivingAlertsMtx.Unlock()
+		userAM, err = am.startAlertmanager(req.Context(), userID)
+		if err != nil {
+			level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err)
+			http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError)
+			return
+		}
+
+		level.Debug(am.logger).Log("msg", "alerts received, Alertmanager initialized", "user", userID)
+		userAM.mux.ServeHTTP(w, req)
+		return
+	}
+
 	if
am.fallbackConfig != "" {
 		userAM, err = am.alertmanagerFromFallbackConfig(req.Context(), userID)
 		if errors.Is(err, errNotUploadingFallback) {
@@ -1025,6 +1060,52 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
 	http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed)
 }
 
+// startAlertmanager applies the tenant's stored configuration through setConfig and
+// returns the tenant's Alertmanager from am.alertmanagers.
+//
+// If the store holds no configuration for the tenant, an empty one is uploaded first so
+// that the Alertmanager is not de-activated in the next poll. The returned error wraps
+// errNotUploadingFallback when the tenant is not owned by this instance.
+func (am *MultitenantAlertmanager) startAlertmanager(ctx context.Context, userID string) (*Alertmanager, error) {
+	if !am.isUserOwned(userID) {
+		return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance")
+	}
+
+	cfgDesc, err := am.store.GetAlertConfig(ctx, userID)
+	if err != nil {
+		if !errors.Is(err, alertspb.ErrNotFound) {
+			return nil, errors.Wrap(err, "failed to check for existing configuration")
+		}
+
+		level.Warn(am.logger).Log("msg", "no configuration exists for user; uploading fallback configuration", "user", userID)
+
+		// Upload an empty config so that the Alertmanager is not de-activated in the next poll.
+		cfgDesc = alertspb.ToProto("", nil, userID)
+		if err = am.store.SetAlertConfig(ctx, cfgDesc); err != nil {
+			return nil, err
+		}
+	}
+
+	// Calling setConfig with an empty configuration will use the fallback config.
+	userCfg := amConfig{
+		AlertConfigDesc: cfgDesc,
+		tmplExternalURL: am.cfg.ExternalURL.URL,
+	}
+	if err = am.setConfig(userCfg); err != nil {
+		return nil, err
+	}
+
+	am.alertmanagersMtx.Lock()
+	defer am.alertmanagersMtx.Unlock()
+
+	// A missing map entry yields a nil *Alertmanager, which the caller would dereference.
+	userAM := am.alertmanagers[userID]
+	if userAM == nil {
+		return nil, errors.Errorf("alertmanager for user %q not available after applying configuration", userID)
+	}
+	return userAM, nil
+}
+
 func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) {
 	// Make sure we never create fallback instances for a user not owned by this instance.
// This check is not strictly necessary as the configuration polling loop will deactivate