diff --git a/CHANGELOG.md b/CHANGELOG.md index c31ba2b06fd..638dd7b29a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## main / unreleased * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 +* [FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 ### Grafana Mimir diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 3ac7f5c294b..6b95fb0c29d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4368,6 +4368,50 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_labels", + "required": false, + "desc": "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-labels", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_labels_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 2, + "fieldFlag": "validation.max-cost-attribution-labels-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_cardinality_per_user", + "required": false, + "desc": "Maximum cardinality of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 10000, + "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cooldown", + "required": false, + "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. 
A reset occurs when the cardinality falls below the limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.cost-attribution-cooldown", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -19639,6 +19683,39 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution.eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_registry_path", + "required": false, + "desc": "Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "cost-attribution.registry-path", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cleanup_interval", + "required": false, + "desc": "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.", + "fieldValue": null, + "fieldDefaultValue": 180000000000, + "fieldFlag": "cost-attribution.cleanup-interval", + "fieldType": "duration", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 08bc71314d3..e7d007312a8 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1283,6 +1283,12 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution.cleanup-interval duration + [experimental] Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged. (default 3m0s) + -cost-attribution.eviction-interval duration + [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. (default 20m0s) + -cost-attribution.registry-path string + [experimental] Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3317,10 +3323,18 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-cooldown duration + [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. 
Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit. + -validation.cost-attribution-labels comma-separated-list-of-strings + [experimental] Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-cardinality-per-user int + [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) + -validation.max-cost-attribution-labels-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. (default 2) -validation.max-label-names-per-info-series int Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80) -validation.max-label-names-per-series int diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 3eb5a0de095..5219b71118a 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -46,6 +46,19 @@ Experimental configuration and flags are subject to change. The following features are currently experimental: +- Cost attribution + - Configure labels for cost attribution + - `-validation.cost-attribution-labels` + - Configure cost attribution limits, such as label cardinality and the maximum number of cost attribution labels + - `-validation.max-cost-attribution-labels-per-user` + - `-validation.max-cost-attribution-cardinality-per-user` + - Configure cooldown periods and eviction intervals for cost attribution + - `-validation.cost-attribution-cooldown` + - `-cost-attribution.eviction-interval` + - Configure the metrics endpoint dedicated to cost attribution + - `-cost-attribution.registry-path` + - Configure the cost attribution cleanup process run interval + - `-cost-attribution.cleanup-interval` - Alertmanager - Enable a set of experimental API endpoints to help support the migration of the Grafana Alertmanager to the Mimir Alertmanager. - `-alertmanager.grafana-alertmanager-compatibility-enabled` diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index f9a147e7bca..ca1378a4f37 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -455,6 +455,23 @@ overrides_exporter: # (experimental) Enables optimized marshaling of timeseries. 
# CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] + +# (experimental) Time interval at which inactive cost attributions are evicted +# from the counter, ensuring they are not included in the cost attribution +# cardinality per user limit. +# CLI flag: -cost-attribution.eviction-interval +[cost_attribution_eviction_interval: | default = 20m] + +# (experimental) Defines a custom path for the registry. When specified, Mimir +# exposes cost attribution metrics through this custom path. If not specified, +# cost attribution metrics aren't exposed. +# CLI flag: -cost-attribution.registry-path +[cost_attribution_registry_path: | default = ""] + +# (experimental) Time interval at which the cost attribution cleanup process +# runs, ensuring inactive cost attribution entries are purged. +# CLI flag: -cost-attribution.cleanup-interval +[cost_attribution_cleanup_interval: | default = 3m] ``` ### common @@ -3569,6 +3586,32 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] +# (experimental) Defines labels for cost attribution. Applies to metrics like +# cortex_received_attributed_samples_total. To disable, set to an empty string. +# For example, 'team,service' produces metrics such as +# cortex_received_attributed_samples_total{team='frontend', service='api'}. +# CLI flag: -validation.cost-attribution-labels +[cost_attribution_labels: | default = ""] + +# (experimental) Maximum number of cost attribution labels allowed per user. +# CLI flag: -validation.max-cost-attribution-labels-per-user +[max_cost_attribution_labels_per_user: | default = 2] + +# (experimental) Maximum cardinality of cost attribution labels allowed per +# user. +# CLI flag: -validation.max-cost-attribution-cardinality-per-user +[max_cost_attribution_cardinality_per_user: | default = 10000] + +# (experimental) Cooldown period for cost attribution labels. Specifies the +# duration the cost attribution remains in overflow before attempting a reset. +# If the cardinality remains above the limit after this period, the system stays +# in overflow mode and extends the cooldown. Setting this value to 0 disables +# the cooldown, causing the system to continuously check whether the cardinality +# has dropped below the limit. A reset occurs when the cardinality falls below +# the limit. +# CLI flag: -validation.cost-attribution-cooldown +[cost_attribution_cooldown: | default = 0s] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. # CLI flag: -ruler.evaluation-delay-duration diff --git a/go.mod b/go.mod index 9e98c9a00d8..abc92d3cdeb 100644 --- a/go.mod +++ b/go.mod @@ -285,7 +285,7 @@ require ( ) // Using a fork of Prometheus with Mimir-specific changes. 
-replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 // Replace memberlist with our fork which includes some fixes that haven't been // merged upstream yet: diff --git a/go.sum b/go.sum index 8c45d27ee4a..c4ca1735f01 100644 --- a/go.sum +++ b/go.sum @@ -1279,8 +1279,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673 h1:z3nSCBMtEMtD/LAIkwrHsT03n7qgeU+0M6rEMZQbxVI= -github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30= +github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 h1:/TwjdoLAxL7URxKJGJUeI539w6LUqcwIcj0WCUxDY/c= +github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw= diff --git a/pkg/api/api.go b/pkg/api/api.go index fd8cdd261f4..c90a6a865a1 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -20,6 +20,7 @@ import ( "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" @@ -281,6 +282,11 @@ func (a *API) RegisterDistributor(d *distributor.Distributor, pushConfig distrib a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET") } +// RegisterCostAttribution registers a Prometheus HTTP handler for the cost attribution metrics. +func (a *API) RegisterCostAttribution(customRegistryPath string, reg *prometheus.Registry) { + a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), false, false, "GET") +} + // Ingester is defined as an interface to allow for alternative implementations // of ingesters to be passed into the API.RegisterIngester() method. 
type Ingester interface { diff --git a/pkg/blockbuilder/tsdb.go b/pkg/blockbuilder/tsdb.go index ee2d610fe78..97cf6ede36d 100644 --- a/pkg/blockbuilder/tsdb.go +++ b/pkg/blockbuilder/tsdb.go @@ -50,7 +50,7 @@ type TSDBBuilder struct { var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, - func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, + func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, ) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..41b5f578f70 --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "context" + "slices" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + + "github.com/grafana/mimir/pkg/util/validation" +) + +const ( + trackerLabel = "tracker" + tenantLabel = "tenant" + defaultTrackerName = "cost-attribution" + missingValue = "__missing__" + overflowValue = "__overflow__" +) + +type Manager struct { + services.Service + logger log.Logger + inactiveTimeout time.Duration + limits *validation.Overrides + + mtx sync.RWMutex + trackersByUserID map[string]*Tracker + reg *prometheus.Registry + cleanupInterval time.Duration +} + +func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { + m := &Manager{ + trackersByUserID: make(map[string]*Tracker), + limits: limits, + mtx: sync.RWMutex{}, + inactiveTimeout: inactiveTimeout, + logger: logger, + reg: reg, + cleanupInterval: cleanupInterval, + } + + m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager") + if err := reg.Register(m); err != nil { + return nil, err + } + return m, nil +} + +func (m *Manager) iteration(_ context.Context) error { + return m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout)) +} + +func (m *Manager) enabledForUser(userID string) bool { + if m == nil { + return false + } + return len(m.limits.CostAttributionLabels(userID)) > 0 +} + +func (m *Manager) Tracker(userID string) *Tracker { + if !m.enabledForUser(userID) { + return nil + } + + // Check if the tracker already exists, if exists return it. Otherwise lock and create a new tracker. + m.mtx.RLock() + tracker, exists := m.trackersByUserID[userID] + m.mtx.RUnlock() + if exists { + return tracker + } + + // We need to create a new tracker, get all the necessary information from the limits before locking and creating the tracker. 
+ labels := m.limits.CostAttributionLabels(userID) + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + cooldownDuration := m.limits.CostAttributionCooldown(userID) + + m.mtx.Lock() + defer m.mtx.Unlock() + if tracker, exists = m.trackersByUserID[userID]; exists { + return tracker + } + tracker = newTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) + m.trackersByUserID[userID] = tracker + return tracker +} + +func (m *Manager) Collect(out chan<- prometheus.Metric) { + m.mtx.RLock() + defer m.mtx.RUnlock() + for _, tracker := range m.trackersByUserID { + tracker.Collect(out) + } +} + +func (m *Manager) Describe(chan<- *prometheus.Desc) { + // Describe is not implemented because the metrics include dynamic labels. The Manager functions as an unchecked exporter. + // For more details, refer to the documentation: https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#hdr-Custom_Collectors_and_constant_Metrics +} + +func (m *Manager) deleteTracker(userID string) { + m.mtx.Lock() + defer m.mtx.Unlock() + delete(m.trackersByUserID, userID) +} + +func (m *Manager) updateTracker(userID string) *Tracker { + if !m.enabledForUser(userID) { + m.deleteTracker(userID) + return nil + } + + t := m.Tracker(userID) + lbls := slices.Clone(m.limits.CostAttributionLabels(userID)) + + // sort the labels to ensure the order is consistent + slices.Sort(lbls) + + // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker + if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != m.limits.CostAttributionCooldown(userID) { + m.mtx.Lock() + t = newTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = t + m.mtx.Unlock() + return t + } + + return t +} + +func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { + m.mtx.RLock() + userIDs := make([]string, 0, len(m.trackersByUserID)) + for userID := range m.trackersByUserID { + userIDs = append(userIDs, userID) + } + m.mtx.RUnlock() + + for _, userID := range userIDs { + t := m.updateTracker(userID) + if t == nil { + continue + } + + invalidKeys := t.inactiveObservations(deadline) + for _, key := range invalidKeys { + t.cleanupTrackerAttribution(key) + } + + if t.recoveredFromOverflow(deadline) { + // We delete the current tracker here, + // this will cause the creation of a new one later. + // ActiveSeries tracker compares the pointer of the tracker, + // and this change will cause a reload there. 
+ m.deleteTracker(userID) + } + } + return nil +} diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go new file mode 100644 index 00000000000..de6a17c0512 --- /dev/null +++ b/pkg/costattribution/manager_test.go @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + + "github.com/grafana/mimir/pkg/costattribution/testutils" + "github.com/grafana/mimir/pkg/mimirpb" +) + +func newTestManager() *Manager { + logger := log.NewNopLogger() + limits, _ := testutils.NewMockCostAttributionLimits(0) + reg := prometheus.NewRegistry() + manager, err := NewManager(5*time.Second, 10*time.Second, logger, limits, reg) + if err != nil { + panic(err) + } + return manager +} + +func TestManager_New(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager) + assert.NotNil(t, manager.trackersByUserID) + assert.Equal(t, 10*time.Second, manager.inactiveTimeout) +} + +func TestManager_CreateDeleteTracker(t *testing.T) { + manager := newTestManager() + + t.Run("Tracker existence and attributes", func(t *testing.T) { + user1Tracker := manager.Tracker("user1") + assert.NotNil(t, user1Tracker) + assert.True(t, user1Tracker.hasSameLabels([]string{"team"})) + assert.Equal(t, 5, user1Tracker.maxCardinality) + + assert.Nil(t, manager.Tracker("user2")) + + user3Tracker := manager.Tracker("user3") + assert.NotNil(t, user3Tracker) + assert.True(t, user3Tracker.hasSameLabels([]string{"department", "service"})) + assert.Equal(t, 2, user3Tracker.maxCardinality) + }) + + t.Run("Metrics tracking", func(t *testing.T) { + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"department", "foo", "service", "dodo"}, SamplesCount: 1}}), time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="bar",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Purge inactive attributions", func(t *testing.T) { + err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0)) + assert.NoError(t, err) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Disabling user cost attribution", func(t *testing.T) { + var err error + manager.limits, err = testutils.NewMockCostAttributionLimits(1) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0))) + assert.Equal(t, 1, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Updating user cardinality and labels", func(t *testing.T) { + var err error + manager.limits, err = testutils.NewMockCostAttributionLimits(2) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0))) + assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) + + manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "bar", "feature", "bar"}, SamplesCount: 1}}), time.Unix(15, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "baz", "feature", "baz"}, SamplesCount: 1}}), time.Unix(16, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo", "feature", "foo"}, SamplesCount: 1}}), time.Unix(17, 0)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 2 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { + manager := newTestManager() + + manager.Tracker("user1").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo"}, SamplesCount: 1}}), time.Unix(1, 0)) + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) + + t.Run("Purge before inactive timeout", func(t *testing.T) { + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0))) + assert.Equal(t, 2, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge after inactive timeout", func(t *testing.T) { + // disable cost attribution for user1 to test purging + manager.limits, _ = testutils.NewMockCostAttributionLimits(1) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) + + // User3's tracker should remain since it's active, user1's tracker should be removed + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Nil(t, manager.Tracker("user1"), "Expected user1 tracker to be purged") + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge all trackers", func(t *testing.T) { + // Trigger a purge that should remove all inactive trackers + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0))) + + // Tracker would stay at 1 since user1's tracker is disabled + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + + // No metrics should remain after all purged + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) +} diff --git a/pkg/costattribution/testutils/test_utils.go b/pkg/costattribution/testutils/test_utils.go new file mode 100644 index 00000000000..62dc617e04b --- /dev/null +++ b/pkg/costattribution/testutils/test_utils.go @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package testutils + +import ( + "github.com/grafana/mimir/pkg/mimirpb" + "github.com/grafana/mimir/pkg/util/validation" +) + +func NewMockCostAttributionLimits(idx int, lvs ...string) (*validation.Overrides, error) { + baseLimits := map[string]*validation.Limits{ + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, + "user5": {MaxCostAttributionCardinalityPerUser: 10, CostAttributionLabels: []string{"a"}}, + } + if len(lvs) > 0 { + baseLimits[lvs[0]] = &validation.Limits{ + MaxCostAttributionCardinalityPerUser: 10, + CostAttributionLabels: lvs[1:], + } + } + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} + +type Series struct { + LabelValues []string + SamplesCount int +} + +func CreateRequest(data []Series) *mimirpb.WriteRequest { + timeSeries := make([]mimirpb.PreallocTimeseries, 0, len(data)) + for i := 0; i < len(data); i++ { + var Labels []mimirpb.LabelAdapter + for j := 0; j+1 < len(data[i].LabelValues); j += 2 { + Labels = append(Labels, mimirpb.LabelAdapter{Name: data[i].LabelValues[j], Value: data[i].LabelValues[j+1]}) + } + timeSeries = append(timeSeries, mimirpb.PreallocTimeseries{ + TimeSeries: &mimirpb.TimeSeries{ + Labels: Labels, + Samples: make([]mimirpb.Sample, data[i].SamplesCount), + }, + }) + } + return &mimirpb.WriteRequest{Timeseries: timeSeries} +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go new file mode 100644 index 00000000000..a00b7fd4fc4 --- /dev/null +++ b/pkg/costattribution/tracker.go @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: AGPL-3.0-only + 
+package costattribution + +import ( + "bytes" + "slices" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" + + "github.com/grafana/mimir/pkg/mimirpb" +) + +const sep = rune(0x80) + +type observation struct { + lastUpdate atomic.Int64 + activeSerie atomic.Float64 + receivedSample atomic.Float64 + discardedSampleMtx sync.Mutex + discardedSample map[string]*atomic.Float64 + totalDiscarded atomic.Float64 +} + +type Tracker struct { + userID string + labels []string + maxCardinality int + activeSeriesPerUserAttribution *prometheus.Desc + receivedSamplesAttribution *prometheus.Desc + discardedSampleAttribution *prometheus.Desc + overflowLabels []string + observed map[string]*observation + observedMtx sync.RWMutex + hashBuffer []byte + isOverflow atomic.Bool + overflowCounter *observation + totalFailedActiveSeries *atomic.Float64 + cooldownDuration time.Duration + cooldownUntil time.Time + logger log.Logger +} + +func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { + orderedLables := slices.Clone(trackedLabels) + slices.Sort(orderedLables) + + // Create a map for overflow labels to export when overflow happens + overflowLabels := make([]string, len(orderedLables)+2) + for i := range orderedLables { + overflowLabels[i] = overflowValue + } + + overflowLabels[len(orderedLables)] = userID + overflowLabels[len(orderedLables)+1] = overflowValue + + tracker := &Tracker{ + userID: userID, + labels: orderedLables, + maxCardinality: limit, + observed: make(map[string]*observation), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: cooldown, + logger: logger, + overflowLabels: overflowLabels, + totalFailedActiveSeries: atomic.NewFloat64(0), + } + + variableLabels := slices.Clone(orderedLables) + variableLabels = append(variableLabels, tenantLabel, "reason") + tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", + "The total number of samples that were discarded per attribution.", + variableLabels, + prometheus.Labels{trackerLabel: defaultTrackerName}) + + tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", + "The total number of samples that were received per attribution.", + variableLabels[:len(variableLabels)-1], + prometheus.Labels{trackerLabel: defaultTrackerName}) + + tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", + "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], + prometheus.Labels{trackerLabel: defaultTrackerName}) + + return tracker +} + +func (t *Tracker) hasSameLabels(labels []string) bool { + return slices.Equal(t.labels, labels) +} + +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +func (t *Tracker) cleanupTrackerAttribution(key string) { + t.observedMtx.Lock() + defer t.observedMtx.Unlock() + delete(t.observed, key) +} + +func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now, 1, 0, 0, nil, true) +} + +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { + if t == nil { + return + } + t.updateCounters(lbs, time.Time{}, -1, 0, 0, nil, false) +} + +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + + if t.isOverflow.Load() { + out <- 
prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) + return + } + + // Collect metrics for all observed keys + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() + for key, o := range t.observed { + if key == "" { + continue + } + keys := strings.Split(key, string(sep)) + + keys = append(keys, t.userID) + if o.activeSerie.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) + } + if o.receivedSample.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) + } + o.discardedSampleMtx.Lock() + for reason, discarded := range o.discardedSample { + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) + } + o.discardedSampleMtx.Unlock() + } +} + +func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { + if t == nil { + return + } + t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason, true) +} + +func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { + if t == nil { + return + } + + dict := make(map[string]int) + for _, ts := range req.Timeseries { + lvs := t.extractLabelValuesFromLabelAdapater(ts.Labels) + dict[t.hashLabelValues(lvs)] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) + } + + // Update the observations for each label set and update the state per request, + // this would be less precised than per sample but it's more efficient + var total float64 + for k, v := range dict { + count := float64(v) + t.updateObservations(k, now.Unix(), 0, count, 0, nil, true) + total += count + } + t.updateState(now, 0, total, 0) +} + +func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + labelValues := t.extractLabelValuesFromLabelAdapater(lbls) + key := t.hashLabelValues(labelValues) + t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) +} + +func (t *Tracker) hashLabelValues(labelValues []string) string { + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + + for i, value := range labelValues { + if i > 0 { + buf.WriteRune(sep) + } + buf.WriteString(value) + } + return buf.String() +} + +func (t *Tracker) extractLabelValuesFromLabelAdapater(lbls []mimirpb.LabelAdapter) []string { + labelValues := make([]string, len(t.labels)) + for idx, cal := range t.labels { + for _, l := range lbls { + if l.Name == cal { + labelValues[idx] = l.Value + break + } + } + if labelValues[idx] == "" { + labelValues[idx] = missingValue + } + } + return labelValues +} + +func (t *Tracker) updateCounters(lbls labels.Labels, ts 
time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + labelValues := make([]string, len(t.labels)) + for idx, cal := range t.labels { + labelValues[idx] = lbls.Get(cal) + if labelValues[idx] == "" { + labelValues[idx] = missingValue + } + } + key := t.hashLabelValues(labelValues) + t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) +} + +// updateObservations updates or creates a new observation in the 'observed' map. +func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + t.observedMtx.RLock() + o, known := t.observed[key] + t.observedMtx.RUnlock() + + if !known { + if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { + // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call + // Otherwise create a new observation for the key + t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + } + return + } + + o.lastUpdate.Store(ts) + if activeSeriesIncrement != 0 { + o.activeSerie.Add(activeSeriesIncrement) + } + if receivedSampleIncrement > 0 { + o.receivedSample.Add(receivedSampleIncrement) + } + if discardedSampleIncrement > 0 && reason != nil { + o.discardedSampleMtx.Lock() + if _, ok := o.discardedSample[*reason]; ok { + o.discardedSample[*reason].Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + } + o.discardedSampleMtx.Unlock() + } +} + +// updateState checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. +func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { + previousOverflow := true + t.observedMtx.RLock() + // Transition to overflow mode if maximum cardinality is exceeded. + if !t.isOverflow.Load() && len(t.observed) > t.maxCardinality { + // Make sure that we count current overflow only when state is switched to overflow from normal. + previousOverflow = t.isOverflow.Swap(true) + if !previousOverflow { + // Initialize the overflow counter. + t.overflowCounter = &observation{} + + // Aggregate active series from all keys into the overflow counter. + for _, o := range t.observed { + if o != nil { + t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) + } + } + t.cooldownUntil = ts.Add(t.cooldownDuration) + } + } + t.observedMtx.RUnlock() + + if t.isOverflow.Load() { + // if already in overflow mode, update the overflow counter. If it was normal mode, the active series are already applied. + if previousOverflow && activeSeriesIncrement != 0 { + t.overflowCounter.activeSerie.Add(activeSeriesIncrement) + } + if receivedSampleIncrement > 0 { + t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + } + if discardedSampleIncrement > 0 { + t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + } + } +} + +// createNewObservation creates a new observation in the 'observed' map. 
+func (t *Tracker) createNewObservation(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + t.observedMtx.Lock() + defer t.observedMtx.Unlock() + if _, exists := t.observed[key]; exists { + return + } + + t.observed[key] = &observation{ + lastUpdate: *atomic.NewInt64(ts), + activeSerie: *atomic.NewFloat64(activeSeriesIncrement), + receivedSample: *atomic.NewFloat64(receivedSampleIncrement), + discardedSample: make(map[string]*atomic.Float64), + discardedSampleMtx: sync.Mutex{}, + } + if discardedSampleIncrement > 0 && reason != nil { + t.observed[key].discardedSampleMtx.Lock() + t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + t.observed[key].discardedSampleMtx.Unlock() + } +} + +func (t *Tracker) recoveredFromOverflow(deadline time.Time) bool { + t.observedMtx.RLock() + if !t.cooldownUntil.IsZero() && t.cooldownUntil.Before(deadline) { + if len(t.observed) <= t.maxCardinality { + t.observedMtx.RUnlock() + return true + } + t.observedMtx.RUnlock() + + // Increase the cooldown duration if the number of observations is still above the max cardinality + t.observedMtx.Lock() + if len(t.observed) <= t.maxCardinality { + t.observedMtx.Unlock() + return true + } + t.cooldownUntil = deadline.Add(t.cooldownDuration) + t.observedMtx.Unlock() + } else { + t.observedMtx.RUnlock() + } + return false +} + +func (t *Tracker) inactiveObservations(deadline time.Time) []string { + // otherwise, we need to check all observations and clean up the ones that are inactive + var invalidKeys []string + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() + for labkey, ob := range t.observed { + if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { + invalidKeys = append(invalidKeys, labkey) + } + } + + return invalidKeys +} diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go new file mode 100644 index 00000000000..b00462ca3b9 --- /dev/null +++ b/pkg/costattribution/tracker_test.go @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "sync" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/grafana/mimir/pkg/costattribution/testutils" + "github.com/grafana/mimir/pkg/mimirpb" +) + +func TestTracker_hasSameLabels(t *testing.T) { + tracker := newTestManager().Tracker("user1") + assert.True(t, tracker.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") +} + +func TestTracker_IncrementReceviedSamples(t *testing.T) { + tManager := newTestManager() + tracker := tManager.Tracker("user4") + t.Run("One Single Series in Request", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}}), time.Unix(10, 0)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 3 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) + t.Run("Multiple Different Series in Request", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, + {LabelValues: []string{"platform", "bar", "service", "yoyo"}, SamplesCount: 5}, + }), time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 6 + cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) + + t.Run("Multiple Series in Request with Same Labels", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, + {LabelValues: []string{"platform", "foo", "service", "yoyo"}, SamplesCount: 5}, + }), time.Unix(30, 0)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 14 + cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func TestTracker_CreateDelete(t *testing.T) { + tManager := newTestManager() + tracker := tManager.Tracker("user4") + + tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) + tracker.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) + tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) + tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 + ` + + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + "cortex_ingester_attributed_active_series", + } + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(time.Unix(5, 0))) + assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) + + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) + tManager.deleteTracker("user4") + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) +} + +func TestTracker_updateCounters(t *testing.T) { + tracker := newTestManager().Tracker("user3") + lbls1 := labels.FromStrings("department", "foo", "service", "bar") + lbls2 := labels.FromStrings("department", "bar", "service", "baz") + lbls3 := labels.FromStrings("department", "baz", "service", "foo") + + tracker.updateCounters(lbls1, time.Unix(1, 0), 1, 0, 0, nil, true) + assert.False(t, tracker.isOverflow.Load(), "First observation, should not overflow") + + tracker.updateCounters(lbls2, time.Unix(2, 0), 1, 0, 0, nil, true) + assert.False(t, tracker.isOverflow.Load(), "Second observation, should not overflow") + + tracker.updateCounters(lbls3, time.Unix(3, 0), 1, 0, 0, nil, true) + assert.True(t, tracker.isOverflow.Load(), "Third observation, should overflow") + + tracker.updateCounters(lbls3, time.Unix(4, 0), 1, 0, 0, nil, true) + assert.True(t, tracker.isOverflow.Load(), "Fourth observation, should stay overflow") + + assert.Equal(t, time.Unix(3, 0).Add(tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") +} + +func TestTracker_inactiveObservations(t *testing.T) { + // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. + tracker := newTestManager().Tracker("user1") + + // Create two observations with different last update timestamps. + observations := [][]mimirpb.LabelAdapter{ + {{Name: "team", Value: "foo"}}, + {{Name: "team", Value: "bar"}}, + {{Name: "team", Value: "baz"}}, + } + + // Simulate samples discarded with different timestamps. + tracker.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + tracker.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) + tracker.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) + + // Ensure that two observations were successfully added to the tracker. 
+ require.Len(t, tracker.observed, 3) + + // Purge observations that haven't been updated in the last 10 seconds. + purged := tracker.inactiveObservations(time.Unix(0, 0)) + require.Len(t, purged, 0) + + purged = tracker.inactiveObservations(time.Unix(10, 0)) + assert.ElementsMatch(t, []string{"foo"}, purged) + + purged = tracker.inactiveObservations(time.Unix(15, 0)) + assert.ElementsMatch(t, []string{"foo", "bar"}, purged) + + // Check that the purged observation matches the expected details. + purged = tracker.inactiveObservations(time.Unix(25, 0)) + assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) +} + +func TestTracker_Concurrency(t *testing.T) { + m := newTestManager() + tracker := m.Tracker("user1") + + var wg sync.WaitGroup + var i int64 + for i = 0; i < 100; i++ { + wg.Add(1) + go func(i int64) { + defer wg.Done() + lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) + tracker.updateCounters(lbls, time.Unix(i, 0), 1, 0, 0, nil, true) + }(i) + } + wg.Wait() + + // Verify no data races or inconsistencies + assert.True(t, len(tracker.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(tracker.observed), 2*tracker.maxCardinality, "Observed count should not exceed 2 times of max cardinality") + assert.True(t, tracker.isOverflow.Load(), "Expected state to be Overflow") + + expectedMetrics := ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{team="__overflow__",tenant="user1",tracker="cost-attribution"} 100 +` + assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) +} diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 41bf16ddb43..2a1d376ed90 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -48,6 +48,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" @@ -112,6 +113,7 @@ type Distributor struct { distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 + costAttributionMgr *costattribution.Manager // For handling HA replicas. 
HATracker haTracker @@ -328,7 +330,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -349,6 +351,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove ingesterPool: NewPool(cfg.PoolConfig, ingestersRing, cfg.IngesterClientFactory, log), healthyInstancesCount: atomic.NewUint32(0), limits: limits, + costAttributionMgr: costAttributionMgr, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -742,13 +745,15 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. + func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil } + cat := d.costAttributionMgr.Tracker(userID) if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0]) + return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -762,7 +767,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -787,8 +792,9 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim return nil } + cat := d.costAttributionMgr.Tracker(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -809,7 +815,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], 
cat) if err != nil { return err } @@ -873,7 +879,8 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation); err != nil { + cat := d.costAttributionMgr.Tracker(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } @@ -966,7 +973,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), time.Now()) + now := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), now) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -980,6 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1237,6 +1246,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { + if len(req.Timeseries) > 0 { + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1821,6 +1833,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) } + d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(req, mtime.Now()) receivedMetadata = len(req.Metadata) d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index ff63b09baa3..163ecf09721 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -54,6 +54,7 @@ import ( "google.golang.org/grpc/metadata" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -2114,7 +2115,7 @@ func mkLabels(n int, extra ...string) []mimirpb.LabelAdapter { ret[i+1] = mimirpb.LabelAdapter{Name: fmt.Sprintf("name_%d", i), Value: fmt.Sprintf("value_%d", i)} } for i := 0; i < len(extra); i += 2 { - ret[i+n+1] = 
mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} + ret[i/2+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} } slices.SortFunc(ret, func(a, b mimirpb.LabelAdapter) int { switch { @@ -2147,7 +2148,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2168,7 +2169,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2188,7 +2189,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(31) + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2209,7 +2210,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long name. - metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx") + metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx", "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2230,7 +2231,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long value. - metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1)) + metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1), "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2250,7 +2251,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().Add(time.Hour).UnixNano() / int64(time.Millisecond), @@ -2261,7 +2262,7 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a sample whose timestamp is too far in the future", }, - "all samples go to metric_relabel_configs": { + "all samples go to metric relabel configs": { prepareConfig: func(limits *validation.Limits) { limits.MetricRelabelConfigs = []*relabel.Config{ { @@ -2278,7 +2279,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2291,78 +2292,110 @@ func BenchmarkDistributor_Push(b *testing.B) { }, } - for testName, testData := range tests { - b.Run(testName, func(b *testing.B) { - // Create an in-memory KV store for the ring with 1 ingester registered. 
- kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) - b.Cleanup(func() { assert.NoError(b, closer.Close()) }) + costAttributionCases := []struct { + state string + customRegistry *prometheus.Registry + cfg func(limits *validation.Limits) + }{ + { + state: "disabled", + customRegistry: nil, + cfg: func(_ *validation.Limits) {}, + }, + { + state: "enabled", + customRegistry: prometheus.NewRegistry(), + cfg: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + }, + } - err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, - func(_ interface{}) (interface{}, bool, error) { - d := &ring.Desc{} - d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) - return d, true, nil - }, - ) - require.NoError(b, err) - - ingestersRing, err := ring.New(ring.Config{ - KVStore: kv.Config{Mock: kvStore}, - HeartbeatTimeout: 60 * time.Minute, - ReplicationFactor: 1, - }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) - }) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for testName, testData := range tests { + b.Run(fmt.Sprintf("scenario=%s", testName), func(b *testing.B) { + // Create an in-memory KV store for the ring with 1 ingester registered. + kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + b.Cleanup(func() { assert.NoError(b, closer.Close()) }) - test.Poll(b, time.Second, 1, func() interface{} { - return ingestersRing.InstancesCount() - }) + err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, + func(_ interface{}) (interface{}, bool, error) { + d := &ring.Desc{} + d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) + return d, true, nil + }, + ) + require.NoError(b, err) + + ingestersRing, err := ring.New(ring.Config{ + KVStore: kv.Config{Mock: kvStore}, + HeartbeatTimeout: 60 * time.Minute, + ReplicationFactor: 1, + }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) + }) - // Prepare the distributor configuration. - var distributorCfg Config - var clientConfig client.Config - limits := validation.Limits{} - flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) - distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" + test.Poll(b, time.Second, 1, func() interface{} { + return ingestersRing.InstancesCount() + }) - limits.IngestionRate = float64(rate.Inf) // Unlimited. - testData.prepareConfig(&limits) + // Prepare the distributor configuration. 
+ var distributorCfg Config + var clientConfig client.Config + limits := validation.Limits{} + flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) + distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" - distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { - return &noopIngester{}, nil - }) + limits.IngestionRate = float64(rate.Inf) // Unlimited. + testData.prepareConfig(&limits) - overrides, err := validation.NewOverrides(limits, nil) - require.NoError(b, err) + distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { + return &noopIngester{}, nil + }) - // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) + caCase.cfg(&limits) + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(b, err) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) - }) + // Initialize the cost attribution manager + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } - // Prepare the series to remote write before starting the benchmark. - metrics, samples := testData.prepareSeries() + // Start the distributor. + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) - // Run the benchmark. - b.ReportAllocs() - b.ResetTimer() + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) + }) - for n := 0; n < b.N; n++ { - _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + // Prepare the series to remote write before starting the benchmark. + metrics, samples := testData.prepareSeries() - if testData.expectedErr == "" && err != nil { - b.Fatalf("no error expected but got %v", err) - } - if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { - b.Fatalf("expected %v error but got %v", testData.expectedErr, err) - } + // Run the benchmark. 
+ b.ReportAllocs() + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + + if testData.expectedErr == "" && err != nil { + b.Fatalf("no error expected but got %v", err) + } + if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { + b.Fatalf("expected %v error but got %v", testData.expectedErr, err) + } + } + }) } }) } @@ -5627,7 +5660,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { @@ -8263,7 +8296,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index bbb6717bc96..698da62eab1 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/extract" "github.com/grafana/mimir/pkg/util/globalerror" @@ -238,15 +239,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -257,20 +260,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. 
// The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -284,6 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { + cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -291,6 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { + cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -392,14 +400,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. 
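// Note on the new tracker argument (an inference from the tests in this change, not
// from upstream documentation): several call sites below pass a nil
// *costattribution.Tracker, and the distributor tests construct the Distributor with a
// nil *costattribution.Manager while the write path still calls
// d.costAttributionMgr.Tracker(userID) unconditionally. This only works if
// Manager.Tracker and the Tracker methods (for example IncrementDiscardedSamples) are
// safe to call on a nil receiver, so validation code can invoke them without nil
// checks; treat that nil-safety as an assumption rather than a documented guarantee.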
-func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { + cat.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -408,11 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -424,17 +436,22 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { + cat.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, validUTF8Message(l.Value), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { + cat.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { + cat.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 0dc670d6e5c..47af497ca64 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -14,6 +14,7 @@ import ( "time" "unicode/utf8" + "github.com/go-kit/log" "github.com/gogo/protobuf/proto" 
"github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/httpgrpc" @@ -25,6 +26,8 @@ import ( grpcstatus "google.golang.org/grpc/status" golangproto "google.golang.org/protobuf/proto" + "github.com/grafana/mimir/pkg/costattribution" + catestutils "github.com/grafana/mimir/pkg/costattribution/testutils" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) @@ -66,6 +69,7 @@ func (vm validateMetadataCfg) MaxMetadataLength(_ string) int { } func TestValidateLabels(t *testing.T) { + ts := time.Now() reg := prometheus.NewPedanticRegistry() s := newSampleValidationMetrics(reg) @@ -74,8 +78,13 @@ func TestValidateLabels(t *testing.T) { cfg.maxLabelValueLength = 25 cfg.maxLabelNameLength = 25 - cfg.maxLabelNamesPerSeries = 2 - cfg.maxLabelNamesPerInfoSeries = 3 + cfg.maxLabelNamesPerSeries = 3 + cfg.maxLabelNamesPerInfoSeries = 4 + limits, _ := catestutils.NewMockCostAttributionLimits(0, userID, "team") + careg := prometheus.NewRegistry() + manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, careg) + require.NoError(t, err) + cat := manager.Tracker(userID) for _, c := range []struct { metric model.Metric @@ -84,25 +93,25 @@ func TestValidateLabels(t *testing.T) { err error }{ { - metric: map[model.LabelName]model.LabelValue{}, + metric: map[model.LabelName]model.LabelValue{"team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: errors.New(noMetricNameMsgFormat), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: " "}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: " ", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf(invalidMetricNameMsgFormat, " "), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "metric_name_with_\xb0_invalid_utf8_\xb0"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "metric_name_with_\xb0_invalid_utf8_\xb0", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf(invalidMetricNameMsgFormat, "metric_name_with__invalid_utf8_ (non-ascii characters removed)"), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "foo ": "bar"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "foo ": "bar", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -112,18 +121,19 @@ func TestValidateLabels(t *testing.T) { []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "valid"}, {Name: "foo ", Value: "bar"}, + {Name: "team", Value: "a"}, }, ), ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "team": "c"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelName", "this_is_a_really_really_long_name_that_should_cause_an_error": "test_value_please_ignore"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelName", "this_is_a_really_really_long_name_that_should_cause_an_error": "test_value_please_ignore", "team": "biz"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -131,6 +141,7 @@ func TestValidateLabels(t *testing.T) { "this_is_a_really_really_long_name_that_should_cause_an_error", 
mimirpb.FromLabelAdaptersToString( []mimirpb.LabelAdapter{ + {Name: "team", Value: "biz"}, {Name: model.MetricNameLabel, Value: "badLabelName"}, {Name: "this_is_a_really_really_long_name_that_should_cause_an_error", Value: "test_value_please_ignore"}, }, @@ -138,7 +149,7 @@ func TestValidateLabels(t *testing.T) { ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelValue", "much_shorter_name": "test_value_please_ignore_no_really_nothing_to_see_here"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelValue", "much_shorter_name": "test_value_please_ignore_no_really_nothing_to_see_here", "team": "biz"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -149,12 +160,13 @@ func TestValidateLabels(t *testing.T) { []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "badLabelValue"}, {Name: "much_shorter_name", Value: "test_value_please_ignore_no_really_nothing_to_see_here"}, + {Name: "team", Value: "biz"}, }, ), ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -164,21 +176,22 @@ func TestValidateLabels(t *testing.T) { {Name: model.MetricNameLabel, Value: "foo"}, {Name: "bar", Value: "baz"}, {Name: "blip", Value: "blop"}, + {Name: "team", Value: "plof"}, }, - 2, + 3, )..., ), }, { // *_info metrics have higher label limits. - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { // *_info metrics have higher label limits. 
- metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "blap": "blup"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "blap": "blup", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -189,31 +202,32 @@ func TestValidateLabels(t *testing.T) { {Name: "bar", Value: "baz"}, {Name: "blip", Value: "blop"}, {Name: "blap", Value: "blup"}, + {Name: "team", Value: "a"}, }, - 3, + 4, )..., ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: true, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "invalid%label&name": "bar"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "invalid%label&name": "bar", "team": "biz"}, skipLabelNameValidation: true, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "你好"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "你好", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "abc\xfe\xfddef"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "abc\xfe\xfddef", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -228,7 +242,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, cat, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -249,6 +263,19 @@ func TestValidateLabels(t *testing.T) { cortex_discarded_samples_total{group="custom label",reason="random reason",user="different user"} 1 `), "cortex_discarded_samples_total")) + require.NoError(t, testutil.GatherAndCompare(careg, strings.NewReader(` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="label_invalid",team="a",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_name_too_long",team="biz",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_value_invalid",team="plof",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_value_too_long",team="biz",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="max_label_names_per_info_series",team="a",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="max_label_names_per_series",team="plof",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="metric_name_invalid",team="a",tenant="testUser",tracker="cost-attribution"} 2 + cortex_discarded_attributed_samples_total{reason="missing_metric_name",team="a",tenant="testUser",tracker="cost-attribution"} 1 +`), "cortex_discarded_attributed_samples_total")) + s.deleteUserMetrics(userID) require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` @@ -422,17 +449,17 @@ func TestValidateMetadata(t *testing.T) { } func TestValidateLabelDuplication(t *testing.T) { + ts := time.Now() var cfg validateLabelsCfg cfg.maxLabelNameLength = 10 cfg.maxLabelNamesPerSeries = 10 cfg.maxLabelValueLength = 10 userID := "testUser" - actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -449,7 +476,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -600,7 +627,6 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { registry := prometheus.NewRegistry() metrics := newSampleValidationMetrics(registry) - for _, limit := range []int{0, 1, 2} { for name, h := range testCases { t.Run(fmt.Sprintf("limit-%d-%s", limit, name), func(t *testing.T) { @@ -608,7 +634,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -655,7 +681,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 
+41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { @@ -51,10 +51,10 @@ func TestIsLabelValueActive(t *testing.T) { // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) require.True(t, valid) result, err := IsLabelValueActive(ctx, reader, activeSeries, "a", "1") diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -34,10 +34,10 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { if i+1 == 3 || i+1 == 4 { buckets = 10 // Native histogram with 10 buckets. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -70,10 +70,10 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { if i == 2 || i == 3 { buckets = i * 10 // Native histogram with i*10 buckets. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 5, allActive) @@ -106,17 +106,18 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 if i+1 == 4 { buckets = -1 // Make ref==4 not a native histogram to check that Seek skips it. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -145,14 +146,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -181,14 +183,14 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,13 +26,14 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -57,13 +58,14 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -88,13 +90,14 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..a5261e17031 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -13,10 +13,12 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -44,10 +46,12 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects matchers and lastMatchersUpdate. - matchersMutex sync.RWMutex - matchers *asmodel.Matchers - lastMatchersUpdate time.Time + // configMutex protects matchers, lastConfigUpdate, and cat. + configMutex sync.RWMutex + matchers *asmodel.Matchers + lastConfigUpdate time.Time + + cat *costattribution.Tracker // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -63,16 +67,18 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 + oldestEntryTs atomic.Int64 + mu sync.RWMutex + refs map[storage.SeriesRef]seriesEntry + activeSeriesAttributionFailureCounter atomic.Float64 + active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. + activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. + activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. + activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. + activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. + activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - mu sync.RWMutex - refs map[storage.SeriesRef]seriesEntry - active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. - activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. - activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers.
- activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + cat *costattribution.Tracker } // seriesEntry holds a timestamp for single series. @@ -84,50 +90,50 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.Tracker) *ActiveSeries { + c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} // Stripes are pre-allocated so that we only read on them and no lock is required. for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c } func (c *ActiveSeries) CurrentMatcherNames() []string { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.MatcherNames() } +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { + c.configMutex.RLock() + defer c.configMutex.RUnlock() + return ctCfg.String() != c.matchers.Config().String() || caCfg != c.cat +} + func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm - c.lastMatchersUpdate = now -} - -func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() - return c.matchers.Config() + c.lastConfigUpdate = now } // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. -func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int) { +func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { deletedStripeID := deleted.ref % numStripes - c.stripes[deletedStripeID].remove(deleted.ref) + c.stripes[deletedStripeID].remove(deleted.ref, idx) } } } @@ -149,19 +155,19 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // Purge purges expired entries and returns true if enough time has passed since // last reload. This should be called periodically to avoid unbounded memory // growth. 
-func (c *ActiveSeries) Purge(now time.Time) bool { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() +func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { + c.configMutex.Lock() + defer c.configMutex.Unlock() purgeTime := now.Add(-c.timeout) - c.purge(purgeTime) + c.purge(purgeTime, idx) - return !c.lastMatchersUpdate.After(purgeTime) + return !c.lastConfigUpdate.After(purgeTime) } // purge removes expired entries from the cache. -func (c *ActiveSeries) purge(keepUntil time.Time) { +func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { - c.stripes[s].purge(keepUntil) + c.stripes[s].purge(keepUntil, idx) } } @@ -196,8 +202,8 @@ func (c *ActiveSeries) Active() (total, totalNativeHistograms, totalNativeHistog // of buckets in those active native histogram series. This method does not purge // expired entries, so Purge should be called periodically. func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, totalNativeHistograms int, totalMatchingNativeHistograms []int, totalNativeHistogramBuckets int, totalMatchingNativeHistogramBuckets []int) { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() totalMatching = make([]int, len(c.matchers.MatcherNames())) totalMatchingNativeHistograms = make([]int, len(c.matchers.MatcherNames())) @@ -212,9 +218,17 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { +func (c *ActiveSeries) ActiveSeriesAttributionFailureCount() float64 { + var total float64 + for s := 0; s < numStripes; s++ { + total += c.stripes[s].activeSeriesAttributionFailureCount() + } + return total +} + +func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes - c.stripes[stripeID].remove(storage.SeriesRef(ref)) + c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) } func (c *ActiveSeries) Clear() { @@ -394,6 +408,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -415,10 +430,9 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. -func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.Tracker) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} @@ -429,9 +443,10 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.cat = cat } -func (s *seriesStripe) purge(keepUntil time.Time) { +func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { keepUntilNanos := keepUntil.UnixNano() if oldest := s.oldestEntryTs.Load(); oldest > 0 && keepUntilNanos <= oldest { // Nothing to do. 
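// Why purge and remove now take a tsdb.IndexReader: a seriesEntry does not retain the
// series labels, so when a cost attribution tracker is configured the stripe has to
// resolve each expired ref back to its labels through the index in order to decrement
// the attributed active series gauge; lookups that fail are counted in
// activeSeriesAttributionFailureCounter instead (see the next hunk). A rough sketch of
// how a caller might drive this -- userDB and its Head() accessor are assumptions
// about the ingester side, not something this change defines:
//
//	if idx, err := userDB.Head().Index(); err == nil { // hypothetical per-tenant TSDB head index reader
//		activeSeries.Purge(time.Now(), idx)
//	}
//
// Tests that run without a tracker simply pass a nil reader, as the updated test files
// above show.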
@@ -449,9 +464,17 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { + if s.cat != nil { + if err := idx.Series(ref, &buf, nil); err != nil { + s.activeSeriesAttributionFailureCounter.Add(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } + } if entry.deleted { s.deleted.purge(ref) } @@ -485,11 +508,18 @@ func (s *seriesStripe) purge(keepUntil time.Time) { } } +func (s *seriesStripe) activeSeriesAttributionFailureCount() float64 { + s.mu.Lock() + defer s.mu.Unlock() + + return s.activeSeriesAttributionFailureCounter.Swap(0) +} + // remove a single series from the stripe. // This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry. // Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, // so this method doesn't update the oldestEntryTs. -func (s *seriesStripe) remove(ref storage.SeriesRef) { +func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.mu.Lock() defer s.mu.Unlock() @@ -502,6 +532,14 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if s.cat != nil { + buf := labels.NewScratchBuilder(128) + if err := idx.Series(ref, &buf, nil); err != nil { + s.activeSeriesAttributionFailureCounter.Add(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..690b6b6c71c 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -9,17 +9,25 @@ import ( "fmt" "math" "strconv" + "strings" "sync" "testing" "time" + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" + catestutils "github.com/grafana/mimir/pkg/costattribution/testutils" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -37,10 +45,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - - valid := c.Purge(time.Now()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -50,8 +56,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveBuckets) assert.Empty(t, activeMatchingBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -62,8 +68,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -74,8 +80,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -86,8 +92,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -98,8 +104,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 5, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -111,8 +117,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 8, allActiveBuckets) // more buckets for a histogram - c.UpdateSeries(ls3, ref3, time.Now(), 7) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 7, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -124,8 +130,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 10, allActiveBuckets) // changing a metric from histogram to float - c.UpdateSeries(ls4, ref4, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -150,7 +156,7 @@ 
func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -162,7 +168,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // ref5 is created with the same labelset as ls1, it shouldn't be accounted as different series. - c.UpdateSeries(ls1, ref5, time.Now(), -1) + c.UpdateSeries(ls1, ref5, time.Now(), -1, nil) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) assert.Equal(t, 1, allActiveHistograms) @@ -173,7 +179,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -204,19 +210,19 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // The expected number of series is the total number of series minus the ttl // because the first ttl series should be purged exp := len(series) - (ttl) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -229,9 +235,157 @@ func TestActiveSeries_ContainsRef(t *testing.T) { } } +type mockIndex struct { + mock.Mock + tsdb.IndexReader + existingLabels map[storage.SeriesRef]labels.Labels +} + +func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, _ *[]chunks.Meta) error { + if ls, ok := m.existingLabels[ref]; ok { + builder.Assign(ls) + return nil + } + return fmt.Errorf("no labels found for ref %d", ref) +} + +func TestActiveSeries_UpdateSeries_WithCostAttribution(t *testing.T) { + limits, _ := catestutils.NewMockCostAttributionLimits(0) + reg := prometheus.NewRegistry() + manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) + require.NoError(t, err) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, manager.Tracker("user5")) + testCostAttributionUpdateSeries(t, c, reg) +} + +func testCostAttributionUpdateSeries(t *testing.T, c *ActiveSeries, reg *prometheus.Registry) { + ref1, ls1 := storage.SeriesRef(1), labels.FromStrings("a", "1") + ref2, ls2 := storage.SeriesRef(2), labels.FromStrings("a", "2") + ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") + ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") + ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") + ref6 := storage.SeriesRef(6) // same as ls2 + ref7, ls7 := storage.SeriesRef(7), labels.FromStrings("a", "2", 
"b", "1") + idx := mockIndex{existingLabels: map[storage.SeriesRef]labels.Labels{ref1: ls1, ref2: ls2, ref3: ls3, ref4: ls4, ref5: ls5, ref7: ls7}} + valid := c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls1, ref1, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics := ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls2, ref2, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls3, ref3, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // ref7 has the same cost attribution labels as ref2, but it's a different series. + c.UpdateSeries(ls7, ref7, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls4, ref4, time.Now(), 3, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="4",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls5, ref5, time.Now(), 5, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="4",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="5",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // changing a metric from float to histogram + c.UpdateSeries(ls3, ref3, time.Now(), 6, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // fewer (zero) buckets for a histogram + c.UpdateSeries(ls4, ref4, time.Now(), 0, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // ref2 is deleted from the head, but still active. + c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ + chunks.HeadSeriesRef(ref2): ls2, + }) + // Numbers don't change. + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Don't change after purging. + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // // ls2 is pushed again, this time with ref6 + c.UpdateSeries(ls2, ref6, time.Now(), -1, &idx) + // Numbers don't change. + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Don't change after purging. + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Make sure deleted is empty, so we're not leaking. 
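For readers tracking the new call shapes exercised by this test, the following is a minimal sketch of the wiring introduced in this diff: a `costattribution.Manager` owns its own Prometheus registry and hands out per-tenant trackers, and `ActiveSeries` now accepts such a tracker at construction time plus a TSDB index reader on `UpdateSeries`/`Purge` so series refs can be resolved back to labels. The intervals, tenant ID, and the `overrides` value are illustrative placeholders, not values mandated by the change.

```go
package example

import (
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb"

	"github.com/grafana/mimir/pkg/costattribution"
	"github.com/grafana/mimir/pkg/ingester/activeseries"
	asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model"
	"github.com/grafana/mimir/pkg/util/validation"
)

// newAttributedActiveSeries builds a cost-attribution manager backed by a
// dedicated registry and hands a per-tenant tracker to ActiveSeries.
func newAttributedActiveSeries(overrides *validation.Overrides, userID string) (*activeseries.ActiveSeries, *prometheus.Registry, error) {
	reg := prometheus.NewRegistry()
	// Arguments: cleanup interval, eviction interval (illustrative values), logger, limits provider, registry.
	mgr, err := costattribution.NewManager(3*time.Minute, 20*time.Minute, log.NewNopLogger(), overrides, reg)
	if err != nil {
		return nil, nil, err
	}
	as := activeseries.NewActiveSeries(&asmodel.Matchers{}, 20*time.Minute, mgr.Tracker(userID))
	return as, reg, nil
}

// touchAndPurge shows the widened signatures: UpdateSeries and Purge now also
// take a tsdb.IndexReader so refs can be mapped back to labels for attribution.
func touchAndPurge(as *activeseries.ActiveSeries, idx tsdb.IndexReader) bool {
	as.UpdateSeries(labels.FromStrings("a", "1"), storage.SeriesRef(1), time.Now(), -1, idx)
	return as.Purge(time.Now(), idx)
}
```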
+ assert.Empty(t, c.deleted.refs) + assert.Empty(t, c.deleted.keys) +} + func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -243,7 +397,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") ref6 := storage.SeriesRef(6) // same as ls2 - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -257,8 +411,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -272,8 +426,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -287,8 +441,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -302,8 +456,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -317,8 +471,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -332,8 +486,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 3, allActiveBuckets) - c.UpdateSeries(ls5, ref5, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls5, 
ref5, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -348,8 +502,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 8, allActiveBuckets) // changing a metric from float to histogram - c.UpdateSeries(ls3, ref3, time.Now(), 6) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 6, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -364,8 +518,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 14, allActiveBuckets) // fewer (zero) buckets for a histogram - c.UpdateSeries(ls4, ref4, time.Now(), 0) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 0, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -397,7 +551,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -412,7 +566,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // ls2 is pushed again, this time with ref6 - c.UpdateSeries(ls2, ref6, time.Now(), -1) + c.UpdateSeries(ls2, ref6, time.Now(), -1, nil) // Numbers don't change. allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -427,7 +581,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -448,7 +602,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -488,12 +642,11 @@ func labelsWithHashCollision() (labels.Labels, labels.Labels) { func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -517,22 +670,22 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ deletedRef: deletedLabels, }) - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) exp := len(series) - (ttl) // Purge is not intended to purge - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -563,13 +716,13 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 for i, s := range series { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) // if this series is matching, and they're within the ttl tmp := asm.Matches(s) @@ -578,11 +731,11 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { } } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. 
- c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -596,28 +749,28 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) - c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 1, allActive) - c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) // This will *not* update the series, since there is already newer timestamp. - c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1) + c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -632,30 +785,30 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) assert.Equal(t, []int{1}, activeMatching) c.ReloadMatchers(asm, currentTime) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.False(t, valid) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls1, ref1, currentTime, -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -666,8 +819,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls3, ref3, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls3, ref3, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -681,8 +834,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls4, ref4, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls4, ref4, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -698,15 +851,15 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -717,10 +870,10 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -736,16 +889,15 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) currentTime := time.Now() - - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -757,11 +909,11 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -790,7 +942,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
- c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -824,7 +976,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo nextSeriesID = 0 } - c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1) + c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1, nil) } }(i) } @@ -841,7 +993,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo case <-stopPurge: return default: - c.Purge(future()) + c.Purge(future(), nil) } // Throttle, but keep high pressure from Purge(). @@ -928,10 +1080,10 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { - c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) + c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) now++ } } @@ -953,7 +1105,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} @@ -968,13 +1120,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Prepare series for ix, s := range series { if ix < numExpiresSeries { - c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1) + c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1, nil) } else { - c.UpdateSeries(s, refs[ix], currentTime, -1) + c.UpdateSeries(s, refs[ix], currentTime, -1, nil) } } - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(b, numSeries, allActive) @@ -982,13 +1134,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Purge is going to purge everything currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) if twice { - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index d5a934e781a..6debbca48df 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -52,6 +52,7 @@ import ( "go.uber.org/atomic" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -314,6 +315,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttributionMgr *costattribution.Manager + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -382,7 +385,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. 
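The hunk below widens the ingester constructor: `New` now also receives the cost-attribution manager, and the call sites updated later in this diff pass `nil` when the feature is off. A sketch of the new call shape for embedders, with all variable names being placeholders:

```go
// costAttributionMgr comes from costattribution.NewManager, or is nil when
// cost attribution is not configured (as in most call sites in this diff).
ing, err := ingester.New(cfg, overrides, ingestersRing, partitionRingWatcher,
	activeGroupsCleanupService, costAttributionMgr, registerer, logger)
if err != nil {
	return nil, err
}
```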
-func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -391,6 +394,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -783,10 +787,15 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() { + newCostAttributionTracker := i.costAttributionMgr.Tracker(userID) + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - valid := userDB.activeSeries.Purge(now) + + idx := userDB.Head().MustIndex() + valid := userDB.activeSeries.Purge(now, idx) + idx.Close() + if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -809,6 +818,11 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.metrics.activeNativeHistogramBucketsPerUser.DeleteLabelValues(userID) } + AttributedActiveSeriesFailure := userDB.activeSeries.ActiveSeriesAttributionFailureCount() + if AttributedActiveSeriesFailure > 0 { + i.metrics.attributedActiveSeriesFailuresPerUser.WithLabelValues(userID).Add(AttributedActiveSeriesFailure) + } + for idx, name := range userDB.activeSeries.CurrentMatcherNames() { // We only set the metrics for matchers that actually exist, to avoid increasing cardinality with zero valued metrics. 
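The purge path above opens a reader on the TSDB head before calling `Purge`, presumably so expired or deleted refs can be resolved back to their label sets and the attributed gauge decremented; `ActiveSeriesAttributionFailureCount` then reports refs that could not be attributed. Condensed, the per-tenant refresh added here looks roughly as follows (names shortened, error handling elided):

```go
// Resolve refs to labels during the purge, then release the head reader.
idx := userDB.Head().MustIndex()
valid := userDB.activeSeries.Purge(now, idx)
idx.Close()
_ = valid // the real code handles the "config just reloaded" case here

// Surface active series that could not be attributed as a per-tenant counter.
if failures := userDB.activeSeries.ActiveSeriesAttributionFailureCount(); failures > 0 {
	metrics.attributedActiveSeriesFailuresPerUser.WithLabelValues(userID).Add(failures)
}
```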
if activeMatching[idx] > 0 { @@ -1190,48 +1204,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, - func() { + func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1239,30 +1261,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return 
newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1377,7 +1404,6 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, errProcessor *mimir_storage.SoftAppendErrorProcessor, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Fetch limits once per push request both to avoid processing half the request differently. var ( nativeHistogramsIngestionEnabled = i.limits.NativeHistogramsIngestionEnabled(userID) @@ -1390,6 +1416,11 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels + + // idx is used to decrease active series count in case of error for cost attribution. + idx := i.getTSDB(userID).Head().MustIndex() + defer idx.Close() + for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). 
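Note that the callbacks above call `i.costAttributionMgr.Tracker(userID)` unconditionally, even though the manager is nil whenever the cost-attribution service is not configured (see the module wiring and the many call sites in this diff that pass nil). That only works if `Tracker` and the tracker's methods tolerate nil receivers. The guard below is an assumption about how that is likely implemented, not the actual code, and the `IncrementDiscardedSamples` signature is inferred from the call sites.

```go
// Assumed nil-receiver guard making the unconditional calls above safe when
// cost attribution is disabled; the real implementation may differ.
func (t *Tracker) IncrementDiscardedSamples(lbls []mimirpb.LabelAdapter, value float64, reason string, now time.Time) {
	if t == nil {
		return // no manager configured, or the tenant has no cost-attribution labels
	}
	// ... attribute `value` discarded samples to the tenant's configured labels ...
}
```

`(*Manager).Tracker` must likewise return a usable (possibly nil) tracker on a nil receiver.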
@@ -1406,7 +1437,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) - + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1424,10 +1455,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // ignore native histograms in the condition and statitics as well if outOfOrderWindow <= 0 && minAppendTimeAvailable && len(ts.Exemplars) == 0 && len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { - stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1548,7 +1578,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets) + activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } if len(ts.Exemplars) > 0 && i.limits.MaxGlobalExemplarsPerUser(userID) > 0 { @@ -2643,7 +2673,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD userDB := &userTSDB{ userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.Tracker(userID)), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3243,7 +3273,9 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. - db.activeSeries.Purge(now) + idx := db.Head().MustIndex() + db.activeSeries.Purge(now, idx) + idx.Close() // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". diff --git a/pkg/ingester/ingester_early_compaction_test.go b/pkg/ingester/ingester_early_compaction_test.go index dbb0fd944b7..649a7fc9dc3 100644 --- a/pkg/ingester/ingester_early_compaction_test.go +++ b/pkg/ingester/ingester_early_compaction_test.go @@ -129,7 +129,7 @@ func TestIngester_compactBlocksToReduceInMemorySeries_ShouldTriggerCompactionOnl require.Len(t, listBlocksInDir(t, userBlocksDir), 0) // Use a trick to track all series we've written so far as "inactive". - ingester.getTSDB(userID).activeSeries.Purge(now.Add(30 * time.Minute)) + ingester.getTSDB(userID).activeSeries.Purge(now.Add(30*time.Minute), nil) // Pre-condition check. 
require.Equal(t, uint64(10), ingester.getTSDB(userID).Head().NumSeries()) diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 96ebf4f6165..829a6293c8a 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -60,6 +60,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -3589,53 +3590,114 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + costAttributionCases := []struct { + state string + limitsCfg func(*validation.Limits) + customRegistry *prometheus.Registry + }{ + { + state: "enabled", + limitsCfg: func(*validation.Limits) {}, + customRegistry: nil, + }, + { + state: "disabled", + limitsCfg: func(limits *validation.Limits) { + if limits == nil { + return + } + limits.CostAttributionLabels = []string{"cpu"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + customRegistry: prometheus.NewRegistry(), + }, + } - // Create a mocked ingester - cfg := defaultIngesterTestConfig(b) + tests := []struct { + name string + limitsCfg func() validation.Limits + }{ + { + name: "ingester push succeeded", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + return limitsCfg + }, + }, + } - ingester, err := prepareIngesterWithBlocksStorage(b, cfg, nil, registry) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - defer services.StopAndAwaitTerminated(context.Background(), ingester) //nolint:errcheck + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for _, t := range tests { + b.Run(fmt.Sprintf("scenario=%s", t.name), func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - // Push a single time series to set the TSDB min time. 
- metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + limitCfg := t.limitsCfg() + caCase.limitsCfg(&limitCfg) - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - const ( - series = 10 - samples = 1 - ) + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } + + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - allLabels, allSamples := benchmarkData(series) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) + + // Push a single time series to set the TSDB min time. + metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) + + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) + require.NoError(b, err) + + // so we are benchmark 5000 series with 10 sample each + const ( + series = 5000 + samples = 10 + ) + + allLabels, allSamples := benchmarkData(series) + + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } + } + }) } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) - require.NoError(b, err) - } + }) } } @@ -6232,10 +6294,14 @@ func prepareIngesterWithBlocksStorageAndLimits(t testing.TB, ingesterCfg Config, } func prepareIngesterWithBlockStorageAndOverrides(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { - return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer) + return prepareIngesterWithBlockStorageOverridesAndCostAttribution(t, ingesterCfg, overrides, ingestersRing, dataDir, bucketDir, registerer, nil) +} + +func prepareIngesterWithBlockStorageOverridesAndCostAttribution(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer 
prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { + return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer, cam) } -func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { +func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { // Create a data dir if none has been provided. if dataDir == "" { dataDir = t.TempDir() @@ -6256,7 +6322,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, cam, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6462,7 +6528,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7622,7 +7688,7 @@ func TestHeadCompactionOnStartup(t *testing.T) { ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. 
- ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 38e53d3c090..bf920c383a3 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -44,6 +44,8 @@ type ingesterMetrics struct { activeNativeHistogramBucketsPerUser *prometheus.GaugeVec activeNativeHistogramBucketsCustomTrackersPerUser *prometheus.GaugeVec + attributedActiveSeriesFailuresPerUser *prometheus.CounterVec + // Owned series ownedSeriesPerUser *prometheus.GaugeVec @@ -193,7 +195,10 @@ func newIngesterMetrics( Name: "cortex_ingester_owned_series", Help: "Number of currently owned series per user.", }, []string{"user"}), - + attributedActiveSeriesFailuresPerUser: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_attributed_active_series_failure", + Help: "The total number of failed active series decrement per user", + }, []string{"user"}), maxUsersGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ Name: instanceLimits, Help: instanceLimitsHelp, @@ -401,6 +406,7 @@ func (m *ingesterMetrics) deletePerUserMetrics(userID string) { m.maxLocalSeriesPerUser.DeleteLabelValues(userID) m.ownedSeriesPerUser.DeleteLabelValues(userID) + m.attributedActiveSeriesFailuresPerUser.DeleteLabelValues(userID) } func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 5a3ed82c28c..2e3d40e0d3d 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,12 +619,15 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 + idx := u.Head().MustIndex() + defer idx.Close() + u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - u.activeSeries.Delete(refs[i]) + u.activeSeries.Delete(refs[i], idx) } } }) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index ca8ebfe53d5..540d0f7bbdd 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -52,6 +52,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -148,6 +149,10 @@ type Config struct { Common CommonConfig `yaml:"common"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` + CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"experimental"` + CostAttributionCleanupInterval time.Duration `yaml:"cost_attribution_cleanup_interval" category:"experimental"` } // RegisterFlags registers flags. 
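The next hunk registers the three new top-level options. Enabling the feature amounts to setting a registry path (an empty path leaves cost attribution off) and, optionally, tuning the two intervals. Set programmatically, that looks like the snippet below; the path is an arbitrary example value, and the durations are the flag defaults registered below.

```go
// Illustrative only: the programmatic equivalent of the new flags.
cfg.CostAttributionRegistryPath = "/cost-attribution-metrics" // example path; "" disables the feature
cfg.CostAttributionEvictionInterval = 20 * time.Minute        // -cost-attribution.eviction-interval default
cfg.CostAttributionCleanupInterval = 3 * time.Minute          // -cost-attribution.cleanup-interval default
```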
@@ -173,6 +178,9 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") + f.DurationVar(&c.CostAttributionCleanupInterval, "cost-attribution.cleanup-interval", 3*time.Minute, "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) @@ -739,6 +747,7 @@ type Mimir struct { BlockBuilderScheduler *blockbuilderscheduler.BlockBuilderScheduler ContinuousTestManager *continuoustest.Manager BuildInfoHandler http.Handler + CostAttributionManager *costattribution.Manager } // New makes a new Mimir. diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index a2839280bb9..0c8b1479802 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -43,6 +43,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -107,6 +108,7 @@ const ( BlockBuilderScheduler string = "block-builder-scheduler" ContinuousTest string = "continuous-test" All string = "all" + CostAttributionService string = "cost-attribution-service" // Write Read and Backend are the targets used when using the read-write deployment mode. 
Write string = "write" @@ -463,7 +465,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -645,6 +649,18 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + // The cost attribution service is only initilized if the custom registry path is provided. + if t.Cfg.CostAttributionRegistryPath != "" { + reg := prometheus.NewRegistry() + var err error + t.CostAttributionManager, err = costattribution.NewManager(t.Cfg.CostAttributionCleanupInterval, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.API.RegisterCostAttribution(t.Cfg.CostAttributionRegistryPath, reg) + return t.CostAttributionManager, err + } + return nil, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -656,7 +672,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } @@ -1146,6 +1162,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1186,9 +1203,10 @@ func (t *Mimir) setupModuleManager() error { Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, + DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, 
 Querier: {TenantFederation, Vault},
diff --git a/pkg/storage/soft_append_error_processor.go b/pkg/storage/soft_append_error_processor.go
index 0f02131537d..6fdda3ae588 100644
--- a/pkg/storage/soft_append_error_processor.go
+++ b/pkg/storage/soft_append_error_processor.go
@@ -22,7 +22,7 @@ type SoftAppendErrorProcessor struct {
 errTooOldSample func(int64, []mimirpb.LabelAdapter)
 sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter)
 errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter)
- maxSeriesPerUser func()
+ maxSeriesPerUser func(labels []mimirpb.LabelAdapter)
 maxSeriesPerMetric func(labels []mimirpb.LabelAdapter)
 errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter)
 errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter)
@@ -39,7 +39,7 @@ func NewSoftAppendErrorProcessor(
 errTooOldSample func(int64, []mimirpb.LabelAdapter),
 sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter),
 errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter),
- maxSeriesPerUser func(),
+ maxSeriesPerUser func([]mimirpb.LabelAdapter),
 maxSeriesPerMetric func(labels []mimirpb.LabelAdapter),
 errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter),
 errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter),
@@ -89,7 +89,7 @@ func (e *SoftAppendErrorProcessor) ProcessErr(err error, ts int64, labels []mimi
 e.errDuplicateSampleForTimestamp(ts, labels)
 return true
 case errors.Is(err, globalerror.MaxSeriesPerUser):
- e.maxSeriesPerUser()
+ e.maxSeriesPerUser(labels)
 return true
 case errors.Is(err, globalerror.MaxSeriesPerMetric):
 e.maxSeriesPerMetric(labels)
diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go
index 5b26a5d6c45..4b147583d31 100644
--- a/pkg/streamingpromql/benchmarks/comparison_test.go
+++ b/pkg/streamingpromql/benchmarks/comparison_test.go
@@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable {
 overrides, err := validation.NewOverrides(limits, nil)
 require.NoError(t, err)

- d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger)
+ d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger)
 require.NoError(t, err)

 queryMetrics := stats.NewQueryMetrics(nil)
diff --git a/pkg/streamingpromql/benchmarks/ingester.go b/pkg/streamingpromql/benchmarks/ingester.go
index 6f3b5f04a9a..9107b66f64f 100644
--- a/pkg/streamingpromql/benchmarks/ingester.go
+++ b/pkg/streamingpromql/benchmarks/ingester.go
@@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) (*ingester.Ingester, string, fun
 return services.StopAndAwaitTerminated(context.Background(), ingestersRing)
 })

- ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger())
+ ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger())
 if err != nil {
 cleanup()
 return nil, "", nil, fmt.Errorf("could not create ingester: %w", err)
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index 039929f045a..76614b8992c 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -62,6 +62,8 @@ const (
 QueryIngestersWithinFlag = "querier.query-ingesters-within"
 AlertmanagerMaxGrafanaConfigSizeFlag = "alertmanager.max-grafana-config-size-bytes"
 AlertmanagerMaxGrafanaStateSizeFlag = "alertmanager.max-grafana-state-size-bytes"
"alertmanager.max-grafana-state-size-bytes" + costAttributionLabelsFlag = "validation.cost-attribution-labels" + maxCostAttributionLabelsPerUserFlag = "validation.max-cost-attribution-labels-per-user" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -70,6 +72,7 @@ const ( var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or equal to 1") + errCostAttributionLabelsLimitExceeded = errors.New("invalid value for -" + costAttributionLabelsFlag + ": exceeds the limit defined by -" + maxCostAttributionLabelsPerUserFlag) ) // LimitError is a marker interface for the errors that do not comply with the specified limits. @@ -187,6 +190,12 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. + CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionLabelsPerUser int `yaml:"max_cost_attribution_labels_per_user" json:"max_cost_attribution_labels_per_user" category:"experimental"` + MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` + CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` + // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` @@ -300,6 +309,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. 
+ f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.")
+ f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.")
+ f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.")
 f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.")
 f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.")
 f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable")
@@ -476,6 +489,10 @@ func (l *Limits) validate() error {
 return errInvalidIngestStorageReadConsistency
 }

+ if len(l.CostAttributionLabels) > l.MaxCostAttributionLabelsPerUser {
+ return errCostAttributionLabelsLimitExceeded
+ }
+
 return nil
 }
@@ -797,6 +814,22 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string {
 return o.getOverridesForUser(userID).SeparateMetricsGroupLabel
 }

+func (o *Overrides) CostAttributionLabels(userID string) []string {
+ return o.getOverridesForUser(userID).CostAttributionLabels
+}
+
+func (o *Overrides) MaxCostAttributionLabelsPerUser(userID string) int {
+ return o.getOverridesForUser(userID).MaxCostAttributionLabelsPerUser
+}
+
+func (o *Overrides) CostAttributionCooldown(userID string) time.Duration {
+ return time.Duration(o.getOverridesForUser(userID).CostAttributionCooldown)
+}
+
+func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int {
+ return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser
+}
+
 // IngestionTenantShardSize returns the ingesters shard size for a given user.
 func (o *Overrides) IngestionTenantShardSize(userID string) int {
 return o.getOverridesForUser(userID).IngestionTenantShardSize
diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go
index 9dc82df2d05..c56cb1ab026 100644
--- a/pkg/util/validation/limits_test.go
+++ b/pkg/util/validation/limits_test.go
@@ -1076,6 +1076,12 @@ metric_relabel_configs:
 cfg: `ingest_storage_read_consistency: xyz`,
 expectedErr: errInvalidIngestStorageReadConsistency.Error(),
 },
+ "should fail when cost_attribution_labels exceed max_cost_attribution_labels_per_user": {
+ cfg: `
+cost_attribution_labels: label1, label2, label3,
+max_cost_attribution_labels_per_user: 2`,
+ expectedErr: errCostAttributionLabelsLimitExceeded.Error(),
+ },
 }

 for testName, testData := range tests {
diff --git a/vendor/github.com/prometheus/prometheus/tsdb/head_read.go b/vendor/github.com/prometheus/prometheus/tsdb/head_read.go
index a3cd7b653d1..9ec0595dbdb 100644
--- a/vendor/github.com/prometheus/prometheus/tsdb/head_read.go
+++ b/vendor/github.com/prometheus/prometheus/tsdb/head_read.go
@@ -37,6 +37,10 @@ func (h *Head) Index() (IndexReader, error) {
 return h.indexRange(math.MinInt64, math.MaxInt64), nil
 }

+func (h *Head) MustIndex() IndexReader {
+ return h.indexRange(math.MinInt64, math.MaxInt64)
+}
+
 func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
 if hmin := h.MinTime(); hmin > mint {
 mint = hmin
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 02ca98ac952..cedb067c5ca 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1017,7 +1017,7 @@ github.com/prometheus/exporter-toolkit/web
 github.com/prometheus/procfs
 github.com/prometheus/procfs/internal/fs
 github.com/prometheus/procfs/internal/util
-# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673
+# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8
 ## explicit; go 1.22.0
 github.com/prometheus/prometheus/config
 github.com/prometheus/prometheus/discovery
@@ -1688,7 +1688,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk
 sigs.k8s.io/yaml
 sigs.k8s.io/yaml/goyaml.v2
 sigs.k8s.io/yaml/goyaml.v3
-# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673
+# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8
 # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe
 # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094
 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b
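
Note on the cost-attribution wiring in pkg/mimir (not part of the patch): the new module only starts when -cost-attribution.registry-path is set, and it hands a dedicated Prometheus registry to both costattribution.NewManager and t.API.RegisterCostAttribution. RegisterCostAttribution itself is not in this diff, so the following is only a minimal sketch of how such a registry is typically exposed over HTTP; the path, port, and example metric are illustrative assumptions, not Mimir's actual implementation.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// A dedicated registry, mirroring initCostAttributionService: cost attribution
	// series stay out of the default /metrics endpoint.
	reg := prometheus.NewRegistry()

	// Example metric only; in Mimir the costattribution.Manager registers the real
	// cortex_*_attributed_* series with the tenant's configured labels.
	reg.MustRegister(prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "cortex_received_attributed_samples_total",
			Help: "Example counter for illustration.",
		},
		[]string{"user", "team"},
	))

	// Hypothetical path; in Mimir it would come from -cost-attribution.registry-path.
	mux := http.NewServeMux()
	mux.Handle("/cost-attribution-metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":8080", mux))
}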
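Note on the pkg/storage/soft_append_error_processor.go change: the per-user series limit callback now receives the rejected series' labels, so the caller can attribute the discarded sample (presumably feeding cortex_discarded_attributed_samples_total) rather than only counting it. The callback below is a hypothetical stand-in that shows the shape of the new func([]mimirpb.LabelAdapter) parameter; the "team" label and the in-memory map are assumptions for illustration.

package main

import (
	"fmt"

	"github.com/grafana/mimir/pkg/mimirpb"
)

func main() {
	attributedDiscards := map[string]int{}

	// Matches the new maxSeriesPerUser signature: it is handed the series labels
	// when the per-user series limit rejects a sample.
	maxSeriesPerUser := func(labels []mimirpb.LabelAdapter) {
		team := "__missing__"
		for _, l := range labels {
			if l.Name == "team" {
				team = l.Value
			}
		}
		attributedDiscards[team]++
	}

	maxSeriesPerUser([]mimirpb.LabelAdapter{
		{Name: "__name__", Value: "up"},
		{Name: "team", Value: "frontend"},
	})
	fmt.Println(attributedDiscards) // map[frontend:1]
}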
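Note on the pkg/util/validation changes: validate() now rejects a configuration whose cost_attribution_labels list is longer than max_cost_attribution_labels_per_user (default 2), which is exactly what the new limits_test.go case exercises with three labels against a cap of two. A standalone sketch of that rule, with the flag and YAML plumbing stripped out for brevity:

package main

import (
	"errors"
	"fmt"
	"strings"
)

var errCostAttributionLabelsLimitExceeded = errors.New(
	"invalid value for -validation.cost-attribution-labels: exceeds the limit defined by -validation.max-cost-attribution-labels-per-user")

// validateCostAttribution mirrors the new check in (*Limits).validate().
func validateCostAttribution(costAttributionLabels string, maxLabelsPerUser int) error {
	var labels []string
	if costAttributionLabels != "" {
		labels = strings.Split(costAttributionLabels, ",")
	}
	if len(labels) > maxLabelsPerUser {
		return errCostAttributionLabelsLimitExceeded
	}
	return nil
}

func main() {
	fmt.Println(validateCostAttribution("team,service,namespace", 2)) // limit exceeded
	fmt.Println(validateCostAttribution("team,service", 2))           // <nil>
}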