From e315ebb4c1c3dca30bf7c62eeefdef0b7ac8e5ad Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 24 Oct 2024 15:39:38 +0200 Subject: [PATCH] Poc: cost attribution proposal 2 --- cmd/mimir/config-descriptor.json | 66 ++++ cmd/mimir/help-all.txt.tmpl | 12 + .../config/mimir.yaml | 9 +- pkg/api/api.go | 6 + pkg/blockbuilder/tsdb.go | 2 +- pkg/costattribution/manager.go | 173 +++++++++ pkg/costattribution/manager_test.go | 193 ++++++++++ pkg/costattribution/tracker.go | 345 ++++++++++++++++++ pkg/costattribution/tracker_test.go | 163 +++++++++ pkg/distributor/allcase.txt | 90 +++++ pkg/distributor/distributor.go | 60 +-- pkg/distributor/distributor_test.go | 175 +++++---- pkg/distributor/validate.go | 22 +- pkg/distributor/validate_test.go | 14 +- .../activeseries/active_labels_test.go | 6 +- .../active_native_histogram_postings_test.go | 32 +- .../activeseries/active_postings_test.go | 21 +- pkg/ingester/activeseries/active_series.go | 112 ++++-- .../activeseries/active_series_test.go | 210 ++++++----- pkg/ingester/ingester.go | 63 +++- .../ingester_early_compaction_test.go | 2 +- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 152 +++++--- pkg/ingester/user_tsdb.go | 4 +- pkg/mimir/mimir.go | 8 + pkg/mimir/modules.go | 26 +- pkg/storage/soft_append_error_processor.go | 6 +- .../benchmarks/comparison_test.go | 2 +- pkg/streamingpromql/benchmarks/ingester.go | 2 +- pkg/util/validation/limits.go | 33 ++ pkg/util/validation/limits_test.go | 6 + 31 files changed, 1670 insertions(+), 347 deletions(-) create mode 100644 pkg/costattribution/manager.go create mode 100644 pkg/costattribution/manager_test.go create mode 100644 pkg/costattribution/tracker.go create mode 100644 pkg/costattribution/tracker_test.go create mode 100644 pkg/distributor/allcase.txt diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 3ac7f5c294b..1334a1b047e 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4368,6 +4368,50 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_labels", + "required": false, + "desc": "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-labels", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_labels_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 2, + "fieldFlag": "validation.max-cost-attribution-labels-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_cardinality_per_user", + "required": false, + "desc": "Maximum cardinality of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 10000, + "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cooldown", + "required": false, + "desc": "Cooldown period for cost attribution labels. 
Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.cost-attribution-cooldown", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -19639,6 +19683,28 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution.eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_registry_path", + "required": false, + "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "cost-attribution.registry-path", + "fieldType": "string", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 08bc71314d3..0324a354ceb 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1283,6 +1283,10 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution.eviction-interval duration + [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. (default 20m0s) + -cost-attribution.registry-path string + [experimental] Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3317,10 +3321,18 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-cooldown duration + [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit. 
+ -validation.cost-attribution-labels comma-separated-list-of-strings + [experimental] Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-cardinality-per-user int + [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) + -validation.max-cost-attribution-labels-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. (default 2) -validation.max-label-names-per-info-series int Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80) -validation.max-label-names-per-series int diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 5d245999115..31702611891 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -184,5 +186,10 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 + cost_attribution_labels: "container" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 20m + runtime_config: - file: ./config/runtime.yaml + file: ./config/runtime.yaml \ No newline at end of file diff --git a/pkg/api/api.go b/pkg/api/api.go index e2f6da5735c..131b50643c1 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -20,6 +20,7 @@ import ( "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" @@ -280,6 +281,11 @@ func (a *API) RegisterDistributor(d *distributor.Distributor, pushConfig distrib a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET") } +// RegisterCostAttribution registers a Prometheus HTTP handler for the cost attribution metrics. +func (a *API) RegisterCostAttribution(customRegistryPath string, reg *prometheus.Registry) { + a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), false, false, "GET") +} + // Ingester is defined as an interface to allow for alternative implementations // of ingesters to be passed into the API.RegisterIngester() method. 
type Ingester interface { diff --git a/pkg/blockbuilder/tsdb.go b/pkg/blockbuilder/tsdb.go index ee2d610fe78..97cf6ede36d 100644 --- a/pkg/blockbuilder/tsdb.go +++ b/pkg/blockbuilder/tsdb.go @@ -50,7 +50,7 @@ type TSDBBuilder struct { var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, - func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, + func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, ) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..0c60ed54505 --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "context" + "sort" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + + "github.com/grafana/mimir/pkg/util/validation" +) + +const ( + TrackerLabel = "tracker" + TenantLabel = "tenant" + defaultTrackerName = "cost-attribution" + missingValue = "__missing__" + overflowValue = "__overflow__" +) + +type Manager struct { + services.Service + logger log.Logger + inactiveTimeout time.Duration + limits *validation.Overrides + + mtx sync.RWMutex + trackersByUserID map[string]*Tracker + reg *prometheus.Registry + cleanupInterval time.Duration + metricsExportInterval time.Duration +} + +func NewManager(cleanupInterval, exportInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { + m := &Manager{ + trackersByUserID: make(map[string]*Tracker), + limits: limits, + mtx: sync.RWMutex{}, + inactiveTimeout: inactiveTimeout, + logger: logger, + reg: reg, + cleanupInterval: cleanupInterval, + metricsExportInterval: exportInterval, + } + + m.Service = services.NewBasicService(nil, m.running, nil).WithName("cost attribution manager") + if err := reg.Register(m); err != nil { + return nil, err + } + return m, nil +} + +func (m *Manager) running(ctx context.Context) error { + t := time.NewTicker(m.cleanupInterval) + defer t.Stop() + + for { + select { + case <-t.C: + if err := m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()); err != nil { + return err + } + case <-ctx.Done(): + return nil + } + } +} + +func (m *Manager) EnabledForUser(userID string) bool { + if m == nil { + return false + } + return len(m.limits.CostAttributionLabels(userID)) > 0 +} + +func (m *Manager) TrackerForUser(userID string) *Tracker { + if !m.EnabledForUser(userID) { + return nil + } + + m.mtx.Lock() + defer m.mtx.Unlock() + + if tracker, exists := m.trackersByUserID[userID]; exists { + return tracker + } + + tracker := newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = tracker + return tracker +} + +func (m *Manager) Collect(out chan<- prometheus.Metric) { + m.mtx.RLock() + defer 
m.mtx.RUnlock() + for _, tracker := range m.trackersByUserID { + tracker.Collect(out) + } +} + +func (m *Manager) Describe(chan<- *prometheus.Desc) { +} + +func (m *Manager) deleteUserTracker(userID string) { + m.mtx.Lock() + defer m.mtx.Unlock() + delete(m.trackersByUserID, userID) +} + +func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { + m.mtx.RLock() + userIDs := make([]string, 0, len(m.trackersByUserID)) + for userID := range m.trackersByUserID { + userIDs = append(userIDs, userID) + } + m.mtx.RUnlock() + + for _, userID := range userIDs { + if !m.EnabledForUser(userID) { + m.deleteUserTracker(userID) + continue + } + + invalidKeys := m.inactiveObservationsForUser(userID, deadline) + cat := m.TrackerForUser(userID) + for _, key := range invalidKeys { + cat.cleanupTrackerAttribution(key) + } + + if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline { + if len(cat.observed) <= cat.MaxCardinality() { + cat.state = OverflowComplete + m.deleteUserTracker(userID) + } else { + cat.cooldownUntil.Store(deadline + cat.cooldownDuration) + } + } + } + return nil +} + +func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []string { + cat := m.TrackerForUser(userID) + newTrackedLabels := m.limits.CostAttributionLabels(userID) + sort.Slice(newTrackedLabels, func(i, j int) bool { + return newTrackedLabels[i] < newTrackedLabels[j] + }) + + if !cat.CompareCALabels(newTrackedLabels) { + m.mtx.Lock() + cat = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = cat + m.mtx.Unlock() + return nil + } else { + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + if cat.MaxCardinality() != maxCardinality { + cat.UpdateMaxCardinality(maxCardinality) + } + + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) + if cooldown != cat.CooldownDuration() { + cat.UpdateCooldownDuration(cooldown) + } + } + + return cat.InactiveObservations(deadline) +} diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go new file mode 100644 index 00000000000..1e67704b287 --- /dev/null +++ b/pkg/costattribution/manager_test.go @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + + "github.com/grafana/mimir/pkg/util/validation" +) + +func getMockLimits(idx int) (*validation.Overrides, error) { + baseLimits := map[string]*validation.Limits{ + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, + } + + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + 
baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} + +func newTestManager() *Manager { + logger := log.NewNopLogger() + limits, _ := getMockLimits(0) + reg := prometheus.NewRegistry() + manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) + if err != nil { + panic(err) + } + return manager +} + +func Test_NewManager(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager) + assert.NotNil(t, manager.trackersByUserID) + assert.Equal(t, 10*time.Second, manager.inactiveTimeout) +} + +func Test_CreateDeleteTracker(t *testing.T) { + manager := newTestManager() + + t.Run("Tracker existence and attributes", func(t *testing.T) { + user1Tracker := manager.TrackerForUser("user1") + assert.NotNil(t, user1Tracker) + assert.True(t, user1Tracker.CompareCALabels([]string{"team"})) + assert.Equal(t, 5, user1Tracker.MaxCardinality()) + + assert.Nil(t, manager.TrackerForUser("user2")) + + user3Tracker := manager.TrackerForUser("user3") + assert.NotNil(t, user3Tracker) + assert.True(t, user3Tracker.CompareCALabels([]string{"department", "service"})) + assert.Equal(t, 2, user3Tracker.MaxCardinality()) + }) + + t.Run("Metrics tracking", func(t *testing.T) { + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "bar"), 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="bar",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Purge inactive attributions", func(t *testing.T) { + manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Disabling user cost attribution", func(t *testing.T) { + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Updating user cardinality and labels", func(t *testing.T) { + manager.limits, _ = getMockLimits(2) + manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.True(t, manager.TrackerForUser("user3").CompareCALabels([]string{"feature", "team"})) + + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 2 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func Test_PurgeInactiveAttributionsUntil(t *testing.T) { + manager := newTestManager() + + manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + + t.Run("Purge before inactive timeout", func(t *testing.T) { + manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge after inactive timeout", func(t *testing.T) { + // disable cost attribution for user1 to test purging + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + + // User3's tracker should remain since it's active, user1's tracker should be removed + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Nil(t, manager.TrackerForUser("user1"), "Expected user1 tracker to be purged") + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge all trackers", func(t *testing.T) { + // Trigger a purge that should remove all inactive trackers + manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) + + // Tracker would stay at 1 since user1's tracker is disabled + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + + // No metrics should remain after all purged + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go new file mode 100644 index 00000000000..0a232195848 --- /dev/null +++ b/pkg/costattribution/tracker.go @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "bytes" + "sort" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" +) + +type TrackerState int + +const ( + Normal TrackerState = iota + Overflow + OverflowComplete +) + +const sep = rune(0x80) + +type Observation struct { + lastUpdate *atomic.Int64 + activeSerie *atomic.Float64 + receivedSample *atomic.Float64 + discardSamplemtx sync.Mutex + discardedSample map[string]*atomic.Float64 + totalDiscarded *atomic.Float64 +} + +type Tracker struct { + userID string + caLabels []string + caLabelMap map[string]int + maxCardinality int + activeSeriesPerUserAttribution *prometheus.Desc + receivedSamplesAttribution *prometheus.Desc + discardedSampleAttribution *prometheus.Desc + overflowLabels []string + obseveredMtx sync.RWMutex + observed map[string]*Observation + hashBuffer []byte + state TrackerState + overflowCounter *Observation + cooldownUntil *atomic.Int64 + cooldownDuration int64 + logger log.Logger +} + +func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { + sort.Slice(trackedLabels, func(i, j int) bool { + return trackedLabels[i] < trackedLabels[j] + }) + + // Create a map for fast lookup, and overflow labels to export when overflow happens + caLabelMap := make(map[string]int, len(trackedLabels)) + overflowLabels := make([]string, len(trackedLabels)+2) + for i, label := range trackedLabels { + caLabelMap[label] = i + overflowLabels[i] = overflowValue + } + + overflowLabels[len(trackedLabels)] = userID + overflowLabels[len(trackedLabels)+1] = overflowValue + + tracker := &Tracker{ + userID: userID, + caLabels: trackedLabels, + caLabelMap: caLabelMap, + maxCardinality: limit, + observed: make(map[string]*Observation), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + overflowLabels: overflowLabels, + } + + tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", + "The total number of samples that were discarded per attribution.", + append(trackedLabels, TenantLabel, "reason"), + prometheus.Labels{TrackerLabel: defaultTrackerName}) + + tracker.receivedSamplesAttribution = 
prometheus.NewDesc("cortex_received_attributed_samples_total", + "The total number of samples that were received per attribution.", + append(trackedLabels, TenantLabel), + prometheus.Labels{TrackerLabel: defaultTrackerName}) + + tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", + "The total number of active series per user and attribution.", append(trackedLabels, TenantLabel), + prometheus.Labels{TrackerLabel: defaultTrackerName}) + + return tracker +} + +func (t *Tracker) CompareCALabels(currentLabels []string) bool { + if t == nil { + return len(currentLabels) == 0 + } + if len(t.caLabels) != len(currentLabels) { + return false + } + for _, v := range currentLabels { + if _, exists := t.caLabelMap[v]; !exists { + return false + } + } + return true +} + +func (t *Tracker) MaxCardinality() int { + if t == nil { + return 0 + } + return t.maxCardinality +} + +func (t *Tracker) CooldownDuration() int64 { + if t == nil { + return 0 + } + return t.cooldownDuration +} + +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +func (t *Tracker) cleanupTrackerAttribution(key string) { + if t == nil { + return + } + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() + delete(t.observed, key) +} + +func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil) +} + +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), -1, 0, 0, nil) +} + +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + switch t.state { + case Overflow: + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) + case Normal: + // Collect metrics for all observed streams + t.obseveredMtx.RLock() + defer t.obseveredMtx.RUnlock() + for key, o := range t.observed { + keys := strings.Split(key, string(sep)) + keys = append(keys, t.userID) + if o.activeSerie.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) + } + if o.receivedSample.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) + } + o.discardSamplemtx.Lock() + for reason, discarded := range o.discardedSample { + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) 
+ } + o.discardSamplemtx.Unlock() + } + } +} + +func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 0, 0, value, &reason) +} + +func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 0, value, 0, nil) +} + +func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + labelValues := make([]string, len(t.caLabels)) + lbls.Range(func(l labels.Label) { + if idx, ok := t.caLabelMap[l.Name]; ok { + labelValues[idx] = l.Value + } + }) + for i := 0; i < len(labelValues); i++ { + if labelValues[i] == "" { + labelValues[i] = missingValue + } + } + + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + + // Build the stream key + for i, value := range labelValues { + if i > 0 { + buf.WriteRune(sep) + } + buf.WriteString(value) + } + + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() + + t.updateOverflow(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) +} + +// handleObservation updates or creates a new stream observation in the 'observed' map. +func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + if o, known := t.observed[stream]; known && o.lastUpdate != nil { + // Update the timestamp if needed + if o.lastUpdate.Load() < ts { + o.lastUpdate.Store(ts) + } + if activeSeriesIncrement != 0 { + o.activeSerie.Add(activeSeriesIncrement) + } + if receivedSampleIncrement > 0 { + o.receivedSample.Add(receivedSampleIncrement) + } + if discardedSampleIncrement > 0 && reason != nil { + o.discardSamplemtx.Lock() + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + o.discardSamplemtx.Unlock() + } + } else if len(t.observed) < t.maxCardinality*2 { + // Create a new observation for the stream + t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + } +} + +func (t *Tracker) updateOverflow(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + // Update the stream in the observed map + t.handleObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.handleOverflow(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) +} + +// handleOverflow checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. +func (t *Tracker) handleOverflow(ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { + // Transition to overflow mode if maximum cardinality is exceeded. + previousState := t.state + if t.state == Normal && len(t.observed) > t.maxCardinality { + t.state = Overflow + // Initialize the overflow counter. + t.overflowCounter = &Observation{ + lastUpdate: atomic.NewInt64(ts), + activeSerie: atomic.NewFloat64(0), + receivedSample: atomic.NewFloat64(0), + totalDiscarded: atomic.NewFloat64(0), + } + + // Aggregate active series from all streams into the overflow counter. 
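+	// Note: the overflow counter's received/discarded totals start at zero; the increments
+	// from the current call are applied in the Overflow branch below. The active-series
+	// increment for this call was already applied to t.observed by handleObservation, so the
+	// aggregation over observed streams here accounts for it.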
+ for _, o := range t.observed { + if o != nil { + t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) + } + } + t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) + } + + if t.state == Overflow { + // if already in overflow mode, update the overflow counter. If it was normal mode, the active series are already applied. + if previousState == Overflow && activeSeriesIncrement != 0 { + t.overflowCounter.activeSerie.Add(activeSeriesIncrement) + } + if receivedSampleIncrement > 0 { + t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + } + if discardedSampleIncrement > 0 { + t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + } + } +} + +// createNewObservation creates a new observation in the 'observed' map. +func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + t.observed[stream] = &Observation{ + lastUpdate: atomic.NewInt64(ts), + activeSerie: atomic.NewFloat64(activeSeriesIncrement), + receivedSample: atomic.NewFloat64(receivedSampleIncrement), + discardedSample: map[string]*atomic.Float64{}, + discardSamplemtx: sync.Mutex{}, + } + if discardedSampleIncrement > 0 && reason != nil { + t.observed[stream].discardSamplemtx.Lock() + t.observed[stream].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + t.observed[stream].discardSamplemtx.Unlock() + } +} + +func (t *Tracker) InactiveObservations(deadline int64) []string { + if t == nil { + return nil + } + + // otherwise, we need to check all observations and clean up the ones that are inactive + var invalidKeys []string + t.obseveredMtx.RLock() + defer t.obseveredMtx.RUnlock() + for labkey, ob := range t.observed { + if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { + invalidKeys = append(invalidKeys, labkey) + } + } + + return invalidKeys +} + +func (t *Tracker) UpdateMaxCardinality(limit int) { + if t == nil { + return + } + t.maxCardinality = limit +} + +func (t *Tracker) UpdateCooldownDuration(cooldownDuration int64) { + if t == nil { + return + } + t.cooldownDuration = cooldownDuration +} diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go new file mode 100644 index 00000000000..82de4e8b64c --- /dev/null +++ b/pkg/costattribution/tracker_test.go @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "sync" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func Test_GetCALabels(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.True(t, cat.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch") +} + +func Test_GetMaxCardinality(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.Equal(t, 5, cat.MaxCardinality(), "Expected max cardinality mismatch") +} + +func Test_CreateCleanupTracker(t *testing.T) { + tManager := newTestManager() + cat := tManager.TrackerForUser("user4") + + reg := prometheus.NewRegistry() + err := reg.Register(tManager) + require.NoError(t, err) + + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", 
"2"), time.Unix(2, 0)) + cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(3, 0)) + cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) + cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) + + cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 + ` + + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + "cortex_ingester_attributed_active_series", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.Equal(t, []string{"foo"}, cat.InactiveObservations(5)) + tManager.purgeInactiveAttributionsUntil(5) + + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + tManager.deleteUserTracker("user4") + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) +} + +func Test_UpdateCounters(t *testing.T) { + cat := newTestManager().TrackerForUser("user3") + lbls1 := labels.FromStrings("department", "foo", "service", "bar") + lbls2 := labels.FromStrings("department", "bar", "service", "baz") + lbls3 := labels.FromStrings("department", "baz", "service", "foo") + + cat.updateCounters(lbls1, 1, 1, 0, 0, nil) + assert.Equal(t, Normal, cat.state, "First observation, should not overflow") + + cat.updateCounters(lbls2, 2, 1, 0, 0, nil) + assert.Equal(t, Normal, cat.state, "Second observation, should not overflow") + + cat.updateCounters(lbls3, 3, 1, 0, 0, nil) + assert.Equal(t, Overflow, cat.state, "Third observation, should overflow") + + cat.updateCounters(lbls3, 4, 1, 0, 0, nil) + assert.Equal(t, Overflow, cat.state, "Fourth observation, should stay overflow") + + assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly") +} + +func Test_GetInactiveObservations(t *testing.T) { + // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. 
+ cat := newTestManager().TrackerForUser("user1") + + // Create two observations with different last update timestamps. + observations := []labels.Labels{ + labels.FromStrings("team", "foo"), + labels.FromStrings("team", "bar"), + labels.FromStrings("team", "baz"), + } + // Simulate samples discarded with different timestamps. + cat.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + cat.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) + cat.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) + + // Ensure that two observations were successfully added to the tracker. + require.Len(t, cat.observed, 3) + + // Purge observations that haven't been updated in the last 10 seconds. + purged := cat.InactiveObservations(0) + require.Len(t, purged, 0) + + purged = cat.InactiveObservations(10) + assert.ElementsMatch(t, []string{"foo"}, purged) + + purged = cat.InactiveObservations(15) + assert.ElementsMatch(t, []string{"foo", "bar"}, purged) + + // Check that the purged observation matches the expected details. + purged = cat.InactiveObservations(25) + assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) +} + +func Test_UpdateMaxCardinality(t *testing.T) { + // user1 original max cardinality is 5 + cat := newTestManager().TrackerForUser("user1") + cat.UpdateMaxCardinality(2) + assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2") +} + +func Test_Concurrency(t *testing.T) { + m := newTestManager() + cat := m.TrackerForUser("user1") + + var wg sync.WaitGroup + for i := 0; i < 100; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) + cat.updateCounters(lbls, int64(i), 1, 0, 0, nil) + }(i) + } + wg.Wait() + + // Verify no data races or inconsistencies + assert.True(t, len(cat.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(cat.observed), 2*cat.MaxCardinality(), "Observed count should not exceed 2 times of max cardinality") + assert.Equal(t, Overflow, cat.state, "Expected state to be Overflow") + + expectedMetrics := ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{team="__overflow__",tenant="user1",tracker="cost-attribution"} 100 +` + assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) +} diff --git a/pkg/distributor/allcase.txt b/pkg/distributor/allcase.txt new file mode 100644 index 00000000000..5efb38bff35 --- /dev/null +++ b/pkg/distributor/allcase.txt @@ -0,0 +1,90 @@ +goos: darwin +goarch: amd64 +pkg: github.com/grafana/mimir/pkg/distributor +cpu: Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 292 4093113 ns/op 1137807 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 295 4286668 ns/op 1136742 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 258 4621600 ns/op 1137652 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 300 4381770 ns/op 1137330 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 306 3978604 ns/op 1138153 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 303 3889851 ns/op 1136827 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 217 5309972 ns/op 1218313 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 223 5308695 ns/op 1218015 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 225 5686183 ns/op 1220126 B/op 6060 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5320854 ns/op 1219277 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 224 5362158 ns/op 1218447 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5352613 ns/op 1218641 B/op 6060 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1637728 ns/op 324601 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 668 1699484 ns/op 324867 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 704 1650014 ns/op 324865 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 697 1678209 ns/op 324811 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 712 1679228 ns/op 324811 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1650075 ns/op 325052 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 312 3780976 ns/op 1571034 B/op 7090 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 3830179 ns/op 1572930 B/op 7104 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 
3778948 ns/op 1567952 B/op 7089 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 288 4163770 ns/op 1559790 B/op 7088 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 310 3775677 ns/op 1565793 B/op 7093 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 309 4826310 ns/op 1566713 B/op 7091 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 652 1911060 ns/op 165520 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 657 1825805 ns/op 167283 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 631 1823762 ns/op 166046 B/op 81 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 639 1800926 ns/op 167361 B/op 84 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 645 1801281 ns/op 165645 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 646 1813022 ns/op 166700 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1327 906046 ns/op 2407 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1261 894881 ns/op 2523 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1237 905868 ns/op 2527 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1347 883890 ns/op 2510 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1269 880076 ns/op 2520 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1333 884934 ns/op 2484 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6823420 ns/op 1201064 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 5941364 ns/op 1201755 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6066547 ns/op 1200638 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5998870 ns/op 1201690 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 201 5828347 ns/op 1201056 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5906302 ns/op 1200750 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4090687 ns/op 1590964 B/op 8098 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4113064 ns/op 1589749 B/op 8091 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 265 4166235 ns/op 1583910 B/op 8096 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 283 4157170 ns/op 1583275 
B/op 8099 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 237 4237111 ns/op 1586094 B/op 8093 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 285 4207373 ns/op 1585480 B/op 8095 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 550 2176540 ns/op 183504 B/op 1081 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 502 2186461 ns/op 183481 B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 526 2187088 ns/op 181204 B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 504 2205968 ns/op 182120 B/op 1079 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 531 2192123 ns/op 182981 B/op 1079 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 525 2195721 ns/op 182929 B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1226 986827 ns/op 2559 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1179 980126 ns/op 2446 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 971585 ns/op 2496 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1154 983680 ns/op 2541 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1215 959667 ns/op 2529 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 983919 ns/op 2558 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 10726471 ns/op 1226302 B/op 7062 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 170 7175109 ns/op 1224269 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6481711 ns/op 1225092 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 182 6501399 ns/op 1224896 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 7033662 ns/op 1225391 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 177 6617141 ns/op 1224477 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4385703 ns/op 1162346 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 272 4401598 ns/op 1161965 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4378841 ns/op 1161221 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 266 4438176 ns/op 1161650 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 268 4528658 ns/op 1161541 B/op 7059 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 264 4430113 ns/op 1161600 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 6302555 ns/op 1243108 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5960008 ns/op 1241662 B/op 8059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 199 6671300 ns/op 1243085 B/op 8061 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 202 5823528 ns/op 1241662 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 208 5834922 ns/op 1241914 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5758215 ns/op 1242172 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 554 2115840 ns/op 348972 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 564 2145631 ns/op 348762 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 566 2088044 ns/op 349132 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 562 2152042 ns/op 349683 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 544 2103713 ns/op 348848 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 531 2125180 ns/op 349253 B/op 6055 allocs/op +PASS +ok github.com/grafana/mimir/pkg/distributor 176.572s diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7bf589f7bc4..3594123435d 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -34,20 +34,8 @@ import ( "github.com/grafana/dskit/services" "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" - "github.com/opentracing/opentracing-go" - "github.com/opentracing/opentracing-go/ext" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/relabel" - "github.com/prometheus/prometheus/scrape" - "go.uber.org/atomic" - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" - "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" @@ -60,6 +48,18 @@ import ( "github.com/grafana/mimir/pkg/util/pool" "github.com/grafana/mimir/pkg/util/spanlogger" "github.com/grafana/mimir/pkg/util/validation" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "github.com/prometheus/prometheus/scrape" + "go.uber.org/atomic" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" ) func 
init() { @@ -112,6 +112,7 @@ type Distributor struct { distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 + costAttributionMgr *costattribution.Manager // For handling HA replicas. HATracker haTracker @@ -328,7 +329,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -349,6 +350,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove ingesterPool: NewPool(cfg.PoolConfig, ingestersRing, cfg.IngesterClientFactory, log), healthyInstancesCount: atomic.NewUint32(0), limits: limits, + costAttributionMgr: costAttributionMgr, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -742,13 +744,16 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. -func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { + +func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil } + cat := d.costAttributionMgr.TrackerForUser(userID) + if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0]) + return validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -762,7 +767,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s); err != nil { + if err := validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -782,13 +787,14 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. 
-func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Histograms) == 0 { return nil } + cat := d.costAttributionMgr.TrackerForUser(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -801,6 +807,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim timestamps := make(map[int64]struct{}, min(len(ts.Histograms), 100)) currPos := 0 histogramsUpdated := false + for idx := range ts.Histograms { if _, ok := timestamps[ts.Histograms[idx].Timestamp]; ok { // A sample with the same timestamp has already been validated, so we skip it. @@ -809,7 +816,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) if err != nil { return err } @@ -873,10 +880,10 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation); err != nil { + cat := d.costAttributionMgr.TrackerForUser(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } - now := model.TimeFromUnixNano(nowt.UnixNano()) totalSamplesAndHistograms := len(ts.Samples) + len(ts.Histograms) @@ -966,7 +973,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), time.Now()) + tnow := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), tnow) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -980,6 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, tnow) } return err @@ -1237,6 +1246,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, 
totalN) { + if len(req.Timeseries) > 0 { + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1817,9 +1829,11 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int + for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) + d.costAttributionMgr.TrackerForUser(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.TimeSeries.Samples)+len(ts.TimeSeries.Histograms)), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 76a27fff797..f115e9626f6 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -54,6 +54,7 @@ import ( "google.golang.org/grpc/metadata" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -2114,7 +2115,7 @@ func mkLabels(n int, extra ...string) []mimirpb.LabelAdapter { ret[i+1] = mimirpb.LabelAdapter{Name: fmt.Sprintf("name_%d", i), Value: fmt.Sprintf("value_%d", i)} } for i := 0; i < len(extra); i += 2 { - ret[i+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} + ret[i/2+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} } slices.SortFunc(ret, func(a, b mimirpb.LabelAdapter) int { switch { @@ -2147,7 +2148,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2168,7 +2169,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2188,7 +2189,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(31) + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2209,7 +2210,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long name.
- metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx") + metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx", "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2230,7 +2231,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long value. - metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1)) + metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1), "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2250,7 +2251,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().Add(time.Hour).UnixNano() / int64(time.Millisecond), @@ -2261,7 +2262,7 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a sample whose timestamp is too far in the future", }, - "all samples go to metric_relabel_configs": { + "all samples go to metric relabel configs": { prepareConfig: func(limits *validation.Limits) { limits.MetricRelabelConfigs = []*relabel.Config{ { @@ -2278,7 +2279,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2291,78 +2292,110 @@ func BenchmarkDistributor_Push(b *testing.B) { }, } - for testName, testData := range tests { - b.Run(testName, func(b *testing.B) { - // Create an in-memory KV store for the ring with 1 ingester registered. 
- kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) - b.Cleanup(func() { assert.NoError(b, closer.Close()) }) + costAttributionCases := []struct { + state string + customRegistry *prometheus.Registry + cfg func(limits *validation.Limits) + }{ + { + state: "disabled", + customRegistry: nil, + cfg: func(_ *validation.Limits) {}, + }, + { + state: "enabled", + customRegistry: prometheus.NewRegistry(), + cfg: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + }, + } - err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, - func(_ interface{}) (interface{}, bool, error) { - d := &ring.Desc{} - d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) - return d, true, nil - }, - ) - require.NoError(b, err) - - ingestersRing, err := ring.New(ring.Config{ - KVStore: kv.Config{Mock: kvStore}, - HeartbeatTimeout: 60 * time.Minute, - ReplicationFactor: 1, - }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) - }) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for testName, testData := range tests { + b.Run(fmt.Sprintf("scenario=%s", testName), func(b *testing.B) { + // Create an in-memory KV store for the ring with 1 ingester registered. + kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + b.Cleanup(func() { assert.NoError(b, closer.Close()) }) - test.Poll(b, time.Second, 1, func() interface{} { - return ingestersRing.InstancesCount() - }) + err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, + func(_ interface{}) (interface{}, bool, error) { + d := &ring.Desc{} + d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) + return d, true, nil + }, + ) + require.NoError(b, err) + + ingestersRing, err := ring.New(ring.Config{ + KVStore: kv.Config{Mock: kvStore}, + HeartbeatTimeout: 60 * time.Minute, + ReplicationFactor: 1, + }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) + }) - // Prepare the distributor configuration. - var distributorCfg Config - var clientConfig client.Config - limits := validation.Limits{} - flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) - distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" + test.Poll(b, time.Second, 1, func() interface{} { + return ingestersRing.InstancesCount() + }) - limits.IngestionRate = float64(rate.Inf) // Unlimited. - testData.prepareConfig(&limits) + // Prepare the distributor configuration. 
+ var distributorCfg Config + var clientConfig client.Config + limits := validation.Limits{} + flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) + distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" - distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { - return &noopIngester{}, nil - }) + limits.IngestionRate = float64(rate.Inf) // Unlimited. + testData.prepareConfig(&limits) - overrides, err := validation.NewOverrides(limits, nil) - require.NoError(b, err) + distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { + return &noopIngester{}, nil + }) - // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) + caCase.cfg(&limits) + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(b, err) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) - }) + // Initialize the cost attribution manager + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } - // Prepare the series to remote write before starting the benchmark. - metrics, samples := testData.prepareSeries() + // Start the distributor. + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) - // Run the benchmark. - b.ReportAllocs() - b.ResetTimer() + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) + }) - for n := 0; n < b.N; n++ { - _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + // Prepare the series to remote write before starting the benchmark. + metrics, samples := testData.prepareSeries() - if testData.expectedErr == "" && err != nil { - b.Fatalf("no error expected but got %v", err) - } - if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { - b.Fatalf("expected %v error but got %v", testData.expectedErr, err) - } + // Run the benchmark. 
+ b.ReportAllocs() + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + + if testData.expectedErr == "" && err != nil { + b.Fatalf("no error expected but got %v", err) + } + if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { + b.Fatalf("expected %v error but got %v", testData.expectedErr, err) + } + } + }) } }) } @@ -5627,7 +5660,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { @@ -8263,7 +8296,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index ab9426513ad..8b9849ba730 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/extract" "github.com/grafana/mimir/pkg/util/globalerror" @@ -238,15 +239,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -257,20 +260,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -284,6 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, 
fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -291,6 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -392,14 +400,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -408,11 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -424,17 +436,21 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { m.invalidLabelValue.WithLabelValues(userID, group).Inc() 
return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index df4de2dd60f..c84ed0b58a8 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -60,6 +60,7 @@ func (vm validateMetadataCfg) MaxMetadataLength(_ string) int { } func TestValidateLabels(t *testing.T) { + ts := time.Now() reg := prometheus.NewPedanticRegistry() s := newSampleValidationMetrics(reg) @@ -222,7 +223,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -416,17 +417,17 @@ func TestValidateMetadata(t *testing.T) { } func TestValidateLabelDuplication(t *testing.T) { + ts := time.Now() var cfg validateLabelsCfg cfg.maxLabelNameLength = 10 cfg.maxLabelNamesPerSeries = 10 cfg.maxLabelValueLength = 10 userID := "testUser" - actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -443,7 +444,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -594,7 +595,6 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { registry := prometheus.NewRegistry() metrics := newSampleValidationMetrics(registry) - for _, limit := range []int{0, 1, 2} { for name, h := range testCases { t.Run(fmt.Sprintf("limit-%d-%s", limit, name), func(t *testing.T) { @@ -602,7 +602,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -649,7 +649,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, 
testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { @@ -51,10 +51,10 @@ func TestIsLabelValueActive(t *testing.T) { // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) require.True(t, valid) result, err := IsLabelValueActive(ctx, reader, activeSeries, "a", "1") diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -34,10 +34,10 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { if i+1 == 3 || i+1 == 4 { buckets = 10 // Native histogram with 10 buckets. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -70,10 +70,10 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { if i == 2 || i == 3 { buckets = i * 10 // Native histogram with i*10 buckets. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 5, allActive) @@ -106,17 +106,18 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 if i+1 == 4 { buckets = -1 // Make ref==4 not a native histogram to check that Seek skips it. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -145,14 +146,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -181,14 +183,14 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,13 +26,14 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -57,13 +58,14 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -88,13 +90,14 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..e7895404a22 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -13,10 +13,12 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -44,10 +46,11 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects matchers and lastMatchersUpdate. - matchersMutex sync.RWMutex - matchers *asmodel.Matchers - lastMatchersUpdate time.Time + // configMutex protects matchers, cat, and lastConfigUpdate. + configMutex sync.RWMutex + matchers *asmodel.Matchers + cat *costattribution.Tracker + lastConfigUpdate time.Time // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -63,8 +66,8 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - + oldestEntryTs atomic.Int64 + cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -84,50 +87,61 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} +func NewActiveSeries( + asm *asmodel.Matchers, + timeout time.Duration, + cat *costattribution.Tracker, +) *ActiveSeries { + c := &ActiveSeries{ + matchers: asm, timeout: timeout, cat: cat, + } // Stripes are pre-allocated so that we only read on them and no lock is required.
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c } func (c *ActiveSeries) CurrentMatcherNames() []string { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.MatcherNames() } +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { + currentCTC, currentCAT := c.CurrentConfig() + // TODO: comparing the tracker pointers is probably sufficient here; the tracker is only recreated when its configuration changes, so pointer inequality already signals a real change. + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) +} + func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm - c.lastMatchersUpdate = now + c.lastConfigUpdate = now } -func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() - return c.matchers.Config() +func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattribution.Tracker) { + c.configMutex.RLock() + defer c.configMutex.RUnlock() + return c.matchers.Config(), c.cat } // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. -func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int) { +func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes - created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { deletedStripeID := deleted.ref % numStripes - c.stripes[deletedStripeID].remove(deleted.ref) + c.stripes[deletedStripeID].remove(deleted.ref, idx) } } } @@ -149,19 +163,19 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // Purge purges expired entries and returns true if enough time has passed since // last reload. This should be called periodically to avoid unbounded memory // growth. -func (c *ActiveSeries) Purge(now time.Time) bool { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() +func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { + c.configMutex.Lock() + defer c.configMutex.Unlock() purgeTime := now.Add(-c.timeout) - c.purge(purgeTime) + c.purge(purgeTime, idx) - return !c.lastMatchersUpdate.After(purgeTime) + return !c.lastConfigUpdate.After(purgeTime) } // purge removes expired entries from the cache. -func (c *ActiveSeries) purge(keepUntil time.Time) { +func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { - c.stripes[s].purge(keepUntil) + c.stripes[s].purge(keepUntil, idx) } } @@ -196,8 +210,8 @@ func (c *ActiveSeries) Active() (total, totalNativeHistograms, totalNativeHistog // of buckets in those active native histogram series.
This method does not purge // expired entries, so Purge should be called periodically. func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, totalNativeHistograms int, totalMatchingNativeHistograms []int, totalNativeHistogramBuckets int, totalMatchingNativeHistogramBuckets []int) { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() totalMatching = make([]int, len(c.matchers.MatcherNames())) totalMatchingNativeHistograms = make([]int, len(c.matchers.MatcherNames())) @@ -212,9 +226,9 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { +func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes - c.stripes[stripeID].remove(storage.SeriesRef(ref)) + c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) } func (c *ActiveSeries) Clear() { @@ -394,6 +408,9 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + // If cost attribution labels are configured, we split the active series count by the values of those labels. + // We also keep a reference to the label value in the entry, so that the counter can be decremented accordingly when the series is removed. + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -415,10 +432,13 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. -func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize( + asm *asmodel.Matchers, + deleted *deletedSeries, + cat *costattribution.Tracker, +) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} @@ -429,9 +449,10 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.cat = cat } -func (s *seriesStripe) purge(keepUntil time.Time) { +func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { keepUntilNanos := keepUntil.UnixNano() if oldest := s.oldestEntryTs.Load(); oldest > 0 && keepUntilNanos <= oldest { // Nothing to do. @@ -449,12 +470,21 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { if entry.deleted { s.deleted.purge(ref) } + + if idx != nil { + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) + } delete(s.refs, ref) continue } @@ -489,7 +519,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { // This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry.
// Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, // so this method doesn't update the oldestEntryTs. -func (s *seriesStripe) remove(ref storage.SeriesRef) { +func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.mu.Lock() defer s.mu.Unlock() @@ -502,6 +532,14 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if idx != nil { + buf := labels.NewScratchBuilder(10) + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..ca36450f823 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -37,10 +37,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. - - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - - valid := c.Purge(time.Now()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -50,8 +48,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveBuckets) assert.Empty(t, activeMatchingBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -62,8 +60,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -74,8 +72,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -86,8 +84,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -98,8 +96,8 @@ func 
TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 5, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -111,8 +109,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 8, allActiveBuckets) // more buckets for a histogram - c.UpdateSeries(ls3, ref3, time.Now(), 7) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 7, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -124,8 +122,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 10, allActiveBuckets) // changing a metric from histogram to float - c.UpdateSeries(ls4, ref4, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -150,7 +148,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -162,7 +160,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // ref5 is created with the same labelset as ls1, it shouldn't be accounted as different series. - c.UpdateSeries(ls1, ref5, time.Now(), -1) + c.UpdateSeries(ls1, ref5, time.Now(), -1, nil) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) assert.Equal(t, 1, allActiveHistograms) @@ -173,7 +171,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -204,19 +202,19 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // The expected number of series is the total number of series minus the ttl // because the first ttl series should be purged exp := len(series) - (ttl) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -231,7 +229,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -243,7 +241,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") ref6 := storage.SeriesRef(6) // same as ls2 - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -257,8 +255,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -272,8 +270,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -287,8 +285,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -302,8 +300,8 @@ func testUpdateSeries(t *testing.T, 
c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -317,8 +315,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -332,8 +330,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 3, allActiveBuckets) - c.UpdateSeries(ls5, ref5, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls5, ref5, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -348,8 +346,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 8, allActiveBuckets) // changing a metric from float to histogram - c.UpdateSeries(ls3, ref3, time.Now(), 6) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 6, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -364,8 +362,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 14, allActiveBuckets) // fewer (zero) buckets for a histogram - c.UpdateSeries(ls4, ref4, time.Now(), 0) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 0, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -397,7 +395,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -412,7 +410,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // ls2 is pushed again, this time with ref6 - c.UpdateSeries(ls2, ref6, time.Now(), -1) + c.UpdateSeries(ls2, ref6, time.Now(), -1, nil) // Numbers don't change. allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -427,7 +425,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -448,7 +446,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -488,12 +486,11 @@ func labelsWithHashCollision() (labels.Labels, labels.Labels) { func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -517,22 +514,22 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ deletedRef: deletedLabels, }) - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) exp := len(series) - (ttl) // Purge is not intended to purge - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -563,13 +560,13 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 for i, s := range series { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) // if this series is matching, and they're within the ttl tmp := asm.Matches(s) @@ -578,11 +575,11 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { } } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. 
- c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -596,28 +593,28 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) - c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 1, allActive) - c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) // This will *not* update the series, since there is already newer timestamp. - c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1) + c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -632,30 +629,30 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) assert.Equal(t, []int{1}, activeMatching) c.ReloadMatchers(asm, currentTime) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.False(t, valid) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls1, ref1, currentTime, -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -666,8 +663,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls3, ref3, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls3, ref3, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -681,8 +678,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls4, ref4, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls4, ref4, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -698,15 +695,15 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -717,10 +714,10 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -736,16 +733,15 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) currentTime := time.Now() - - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -757,11 +753,11 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -790,7 +786,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
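All of the test updates above thread two new values through the active-series tracker: a per-tenant cost-attribution tracker at construction time (nil meaning attribution is disabled), and a TSDB index reader on UpdateSeries/Purge/Delete so series refs can be resolved back to labels when attributed counts are adjusted. The following is a minimal self-contained sketch of that shape; the types and method names are stand-ins, not the real activeseries or costattribution APIs, and the index-reader argument is omitted for brevity.

    package main

    import (
        "fmt"
        "time"
    )

    // Stand-in types; the real code uses labels.Labels and storage.SeriesRef.
    type seriesRef uint64
    type labelsSet map[string]string

    // Per-tenant cost-attribution tracker; a nil tracker disables attribution.
    type tracker interface {
        activeSeriesInc(lbls labelsSet)
        activeSeriesDec(lbls labelsSet)
    }

    type activeSeries struct {
        timeout  time.Duration
        tracker  tracker
        lastSeen map[seriesRef]time.Time
        labels   map[seriesRef]labelsSet
    }

    func newActiveSeries(timeout time.Duration, t tracker) *activeSeries {
        return &activeSeries{timeout: timeout, tracker: t,
            lastSeen: map[seriesRef]time.Time{}, labels: map[seriesRef]labelsSet{}}
    }

    func (c *activeSeries) updateSeries(lbls labelsSet, ref seriesRef, now time.Time) {
        // Only the first sighting of a ref counts as a new attributed active series.
        if _, ok := c.lastSeen[ref]; !ok && c.tracker != nil {
            c.tracker.activeSeriesInc(lbls)
        }
        c.lastSeen[ref], c.labels[ref] = now, lbls
    }

    // purge drops idle series and, when a tracker is set, decrements the
    // attributed active-series count for each dropped series.
    func (c *activeSeries) purge(now time.Time) int {
        for ref, seen := range c.lastSeen {
            if now.Sub(seen) < c.timeout {
                continue
            }
            if c.tracker != nil {
                c.tracker.activeSeriesDec(c.labels[ref])
            }
            delete(c.lastSeen, ref)
            delete(c.labels, ref)
        }
        return len(c.lastSeen)
    }

    func main() {
        c := newActiveSeries(time.Minute, nil) // nil tracker, as in the tests above
        c.updateSeries(labelsSet{"team": "frontend"}, 1, time.Now().Add(-2*time.Minute))
        c.updateSeries(labelsSet{"team": "backend"}, 2, time.Now())
        fmt.Println("active after purge:", c.purge(time.Now()))
    }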
- c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -824,7 +820,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo nextSeriesID = 0 } - c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1) + c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1, nil) } }(i) } @@ -841,7 +837,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo case <-stopPurge: return default: - c.Purge(future()) + c.Purge(future(), nil) } // Throttle, but keep high pressure from Purge(). @@ -928,10 +924,10 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { - c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) + c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) now++ } } @@ -953,7 +949,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} @@ -968,13 +964,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Prepare series for ix, s := range series { if ix < numExpiresSeries { - c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1) + c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1, nil) } else { - c.UpdateSeries(s, refs[ix], currentTime, -1) + c.UpdateSeries(s, refs[ix], currentTime, -1, nil) } } - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(b, numSeries, allActive) @@ -982,13 +978,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Purge is going to purge everything currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) if twice { - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 08edb6ab54c..2b3561a3530 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -52,6 +52,7 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -314,6 +315,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttributionMgr *costattribution.Manager + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -368,8 +371,9 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), + tsdbs: 
make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), + bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -382,7 +386,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. -func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -391,6 +395,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -783,10 +788,13 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() { + newCostAttributionTracker := i.costAttributionMgr.TrackerForUser(userID) + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - valid := userDB.activeSeries.Purge(now) + + idx, _ := userDB.Head().Index() + valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. 
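The updateActiveSeries hunk above replaces the plain string comparison of matcher configs with a combined ConfigDiffers check that also accounts for the tenant's cost-attribution tracker, and Purge now additionally receives the head index. A tiny sketch of what that reload decision presumably amounts to, using stand-in types rather than the real asmodel and costattribution packages:

    package main

    import "fmt"

    type matchersConfig string
    type tracker struct{ generation int }

    type activeSeries struct {
        cfg     matchersConfig
        tracker *tracker
    }

    // configDiffers is a guess at the check performed before replaceMatchers:
    // any change in the matcher config or in the tracker triggers a reload.
    func (a *activeSeries) configDiffers(newCfg matchersConfig, newTracker *tracker) bool {
        return newCfg != a.cfg || newTracker != a.tracker
    }

    func main() {
        as := &activeSeries{cfg: "foo{a=~\"2|3\"}", tracker: &tracker{generation: 1}}
        fmt.Println(as.configDiffers("foo{a=~\"2|3\"}", as.tracker)) // false: nothing to do
        fmt.Println(as.configDiffers("foo{a=~\"2|3\"}", &tracker{})) // true: tracker replaced
    }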
i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -1159,7 +1167,6 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Note that we don't .Finish() the span in this method on purpose spanlog := spanlogger.FromContext(ctx, i.logger) spanlog.DebugLog("event", "acquired append lock") - var ( startAppend = time.Now() @@ -1190,48 +1197,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, - func() { + func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, 
reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1239,30 +1254,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1377,7 +1397,6 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, errProcessor *mimir_storage.SoftAppendErrorProcessor, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Fetch limits once per push request both to avoid processing half the request differently. 
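Each soft-error callback above now also reports the discarded samples to the tenant's cost-attribution tracker before building the user-facing error, passing the series labels, a count, the discard reason, and the request start time. The sketch below is an assumption of what such a tracker method could look like; the metric name, label handling, and the map stand-in for labels.Labels are illustrative, not the actual pkg/costattribution implementation.

    package main

    import (
        "fmt"
        "time"

        "github.com/prometheus/client_golang/prometheus"
    )

    type Tracker struct {
        userID            string
        attributionLabels []string               // e.g. {"team"}, from per-tenant limits
        discardedSamples  *prometheus.CounterVec // keyed by tenant, attribution value, reason
    }

    func NewTracker(userID string, attributionLabels []string, reg prometheus.Registerer) *Tracker {
        t := &Tracker{
            userID:            userID,
            attributionLabels: attributionLabels,
            discardedSamples: prometheus.NewCounterVec(prometheus.CounterOpts{
                Name: "cortex_discarded_attributed_samples_total", // assumed name
                Help: "Discarded samples attributed to a cost attribution value.",
            }, []string{"user", "attrib", "reason"}),
        }
        reg.MustRegister(t.discardedSamples)
        return t
    }

    // IncrementDiscardedSamples matches the call shape used in the ingester hunks:
    // series labels, a sample count, a discard reason and the observation time.
    func (t *Tracker) IncrementDiscardedSamples(series map[string]string, value float64, reason string, _ time.Time) {
        if t == nil {
            return // mirrors the nil-tracker (attribution disabled) case
        }
        attrib := "__unattributed__"
        if len(t.attributionLabels) > 0 {
            if v, ok := series[t.attributionLabels[0]]; ok {
                attrib = v
            }
        }
        t.discardedSamples.WithLabelValues(t.userID, attrib, reason).Add(value)
    }

    func main() {
        reg := prometheus.NewRegistry()
        tr := NewTracker("tenant-1", []string{"team"}, reg)
        tr.IncrementDiscardedSamples(map[string]string{"team": "frontend"}, 3, "sample-out-of-order", time.Now())
        fmt.Println("recorded 3 discarded samples for team=frontend")
    }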
var ( nativeHistogramsIngestionEnabled = i.limits.NativeHistogramsIngestionEnabled(userID) @@ -1390,6 +1409,11 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels + + // idx is used to decrease active series count in case of error for cost attribution. + idx, _ := i.getTSDB(userID).Head().Index() + // TODO: deal with the error here + for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). @@ -1405,8 +1429,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre allOutOfBoundsHistograms(ts.Histograms, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) - stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1424,10 +1449,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // ignore native histograms in the condition and statitics as well if outOfOrderWindow <= 0 && minAppendTimeAvailable && len(ts.Exemplars) == 0 && len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { - stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1548,7 +1572,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets) + activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } if len(ts.Exemplars) > 0 && i.limits.MaxGlobalExemplarsPerUser(userID) > 0 { @@ -2642,8 +2666,12 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + userID: userID, + activeSeries: activeseries.NewActiveSeries( + asmodel.NewMatchers(matchersConfig), + i.cfg.ActiveSeriesMetrics.IdleTimeout, + i.costAttributionMgr.TrackerForUser(userID), + ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3243,7 +3271,12 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. 
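The push-path hunks above obtain the head index and currently discard the error (with a TODO), while the compaction hunk below logs the failure and skips the tenant instead. A small sketch of that log-and-skip pattern, using go-kit logging as the surrounding code does; openHeadIndex is a hypothetical stand-in for db.Head().Index():

    package main

    import (
        "errors"
        "os"

        "github.com/go-kit/log"
        "github.com/go-kit/log/level"
    )

    type indexReader struct{}

    // openHeadIndex is a hypothetical stand-in for db.Head().Index().
    func openHeadIndex(fail bool) (*indexReader, error) {
        if fail {
            return nil, errors.New("head index unavailable")
        }
        return &indexReader{}, nil
    }

    func main() {
        logger := log.NewLogfmtLogger(os.Stderr)
        for _, userID := range []string{"tenant-a", "tenant-b"} {
            idx, err := openHeadIndex(userID == "tenant-b")
            if err != nil {
                // Same shape as the compaction hunk: warn and move on to the next
                // tenant rather than purging without label resolution.
                level.Warn(logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err)
                continue
            }
            _ = idx // purge with idx here
        }
    }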
- db.activeSeries.Purge(now) + idx, err := db.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + continue + } + db.activeSeries.Purge(now, idx) // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". diff --git a/pkg/ingester/ingester_early_compaction_test.go b/pkg/ingester/ingester_early_compaction_test.go index 531d8a673f0..822e13374ee 100644 --- a/pkg/ingester/ingester_early_compaction_test.go +++ b/pkg/ingester/ingester_early_compaction_test.go @@ -129,7 +129,7 @@ func TestIngester_compactBlocksToReduceInMemorySeries_ShouldTriggerCompactionOnl require.Len(t, listBlocksInDir(t, userBlocksDir), 0) // Use a trick to track all series we've written so far as "inactive". - ingester.getTSDB(userID).activeSeries.Purge(now.Add(30 * time.Minute)) + ingester.getTSDB(userID).activeSeries.Purge(now.Add(30*time.Minute), nil) // Pre-condition check. require.Equal(t, uint64(10), ingester.getTSDB(userID).Head().NumSeries()) diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 6d03bc83535..b4bbe219fd7 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -60,6 +60,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -3589,53 +3590,114 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + costAttributionCases := []struct { + state string + limitsCfg func(*validation.Limits) + customRegistry *prometheus.Registry + }{ + { + state: "enabled", + limitsCfg: func(*validation.Limits) {}, + customRegistry: nil, + }, + { + state: "disabled", + limitsCfg: func(limits *validation.Limits) { + if limits == nil { + return + } + limits.CostAttributionLabels = []string{"cpu"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + customRegistry: prometheus.NewRegistry(), + }, + } - // Create a mocked ingester - cfg := defaultIngesterTestConfig(b) + tests := []struct { + name string + limitsCfg func() validation.Limits + }{ + { + name: "ingester push succeeded", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + return limitsCfg + }, + }, + } - ingester, err := prepareIngesterWithBlocksStorage(b, cfg, nil, registry) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - defer 
services.StopAndAwaitTerminated(context.Background(), ingester) //nolint:errcheck + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for _, t := range tests { + b.Run(fmt.Sprintf("scenario=%s", t.name), func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - // Push a single time series to set the TSDB min time. - metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + limitCfg := t.limitsCfg() + caCase.limitsCfg(&limitCfg) - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - const ( - series = 10 - samples = 1 - ) + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } + + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - allLabels, allSamples := benchmarkData(series) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) + + // Push a single time series to set the TSDB min time. 
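The benchmark above builds a costattribution.Manager against a separate Prometheus registry and hands it to the ingester factory, so cost-attribution metrics never appear on the main registry. A self-contained illustration of gathering from such a dedicated registry; the metric and label names below follow the team/frontend example in the flag description and are illustrative only.

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
        caReg := prometheus.NewRegistry() // dedicated cost-attribution registry

        c := prometheus.NewCounterVec(prometheus.CounterOpts{
            Name: "cortex_distributor_attributed_received_samples_total",
            Help: "Received samples attributed to a cost attribution value.",
        }, []string{"user", "team"})
        caReg.MustRegister(c)
        c.WithLabelValues("tenant-1", "frontend").Add(42)

        // Gather only from the dedicated registry; this is what a custom
        // registry exposes separately from the main Mimir metrics.
        mfs, err := caReg.Gather()
        if err != nil {
            panic(err)
        }
        for _, mf := range mfs {
            fmt.Println(mf.GetName(), "series:", len(mf.GetMetric()))
        }
    }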
+ metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) + + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) + require.NoError(b, err) + + // so we are benchmark 5000 series with 10 sample each + const ( + series = 5000 + samples = 10 + ) + + allLabels, allSamples := benchmarkData(series) + + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } + } + }) } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) - require.NoError(b, err) - } + }) } } @@ -6232,10 +6294,14 @@ func prepareIngesterWithBlocksStorageAndLimits(t testing.TB, ingesterCfg Config, } func prepareIngesterWithBlockStorageAndOverrides(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { - return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer) + return prepareIngesterWithBlockStorageOverridesAndCostAttribution(t, ingesterCfg, overrides, ingestersRing, dataDir, bucketDir, registerer, nil) +} + +func prepareIngesterWithBlockStorageOverridesAndCostAttribution(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { + return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer, cam) } -func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { +func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { // Create a data dir if none has been provided. 
if dataDir == "" { dataDir = t.TempDir() @@ -6256,7 +6322,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, cam, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6462,7 +6528,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7622,7 +7688,7 @@ func TestHeadCompactionOnStartup(t *testing.T) { ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 5a3ed82c28c..2f31f41892e 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,12 +619,14 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 + idx, _ := u.Head().Index() + // TODO: deal with the err here u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - u.activeSeries.Delete(refs[i]) + u.activeSeries.Delete(refs[i], idx) } } }) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 7bcd3eac250..31baea29e7e 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -52,6 +52,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -148,6 +149,9 @@ type Config struct { Common CommonConfig `yaml:"common"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` + CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"experimental"` } // RegisterFlags registers flags. 
@@ -173,6 +177,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) @@ -739,6 +745,8 @@ type Mimir struct { BlockBuilderScheduler *blockbuilderscheduler.BlockBuilderScheduler ContinuousTestManager *continuoustest.Manager BuildInfoHandler http.Handler + + CostAttributionManager *costattribution.Manager } // New makes a new Mimir. diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 697501af98f..127a771b889 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -43,6 +43,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -80,6 +81,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" + CostAttributionService string = "cost-attribution-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" @@ -462,7 +464,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -644,6 +648,18 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + // The cost attribution service is only initilized if the custom registry path is provided. 
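The module below creates a dedicated registry and registers it under the configured path via t.API.RegisterCostAttribution, which is not shown in this patch. The sketch that follows is an assumption of what exposing such a registry on a custom path looks like with promhttp; the path and metric name are illustrative.

    package main

    import (
        "net/http"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/promhttp"
    )

    func main() {
        caReg := prometheus.NewRegistry()

        // Metrics registered by the cost-attribution manager would land here;
        // this counter is only a placeholder.
        example := prometheus.NewCounter(prometheus.CounterOpts{
            Name: "cost_attribution_example_total",
            Help: "Placeholder metric on the dedicated cost-attribution registry.",
        })
        caReg.MustRegister(example)
        example.Inc()

        mux := http.NewServeMux()
        // Equivalent in spirit to -cost-attribution.registry-path=/cost-attribution/metrics
        mux.Handle("/cost-attribution/metrics", promhttp.HandlerFor(caReg, promhttp.HandlerOpts{}))

        _ = http.ListenAndServe(":8080", mux)
    }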
+ if t.Cfg.CostAttributionRegistryPath != "" { + reg := prometheus.NewRegistry() + var err error + t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.API.RegisterCostAttribution(t.Cfg.CostAttributionRegistryPath, reg) + return t.CostAttributionManager, err + } + return nil, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -655,7 +671,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } @@ -1138,6 +1154,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1178,9 +1195,10 @@ func (t *Mimir) setupModuleManager() error { Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, + DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation, Vault}, diff --git a/pkg/storage/soft_append_error_processor.go b/pkg/storage/soft_append_error_processor.go index 0f02131537d..6fdda3ae588 100644 --- a/pkg/storage/soft_append_error_processor.go +++ b/pkg/storage/soft_append_error_processor.go @@ -22,7 +22,7 @@ type SoftAppendErrorProcessor struct { errTooOldSample func(int64, []mimirpb.LabelAdapter) sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter) errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter) - maxSeriesPerUser func() + maxSeriesPerUser func(labels []mimirpb.LabelAdapter) maxSeriesPerMetric func(labels []mimirpb.LabelAdapter) errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter) errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter) @@ -39,7 +39,7 @@ func NewSoftAppendErrorProcessor( errTooOldSample func(int64, []mimirpb.LabelAdapter), sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter), errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter), - 
maxSeriesPerUser func(), + maxSeriesPerUser func([]mimirpb.LabelAdapter), maxSeriesPerMetric func(labels []mimirpb.LabelAdapter), errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter), errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter), @@ -89,7 +89,7 @@ func (e *SoftAppendErrorProcessor) ProcessErr(err error, ts int64, labels []mimi e.errDuplicateSampleForTimestamp(ts, labels) return true case errors.Is(err, globalerror.MaxSeriesPerUser): - e.maxSeriesPerUser() + e.maxSeriesPerUser(labels) return true case errors.Is(err, globalerror.MaxSeriesPerMetric): e.maxSeriesPerMetric(labels) diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go index 5b26a5d6c45..4b147583d31 100644 --- a/pkg/streamingpromql/benchmarks/comparison_test.go +++ b/pkg/streamingpromql/benchmarks/comparison_test.go @@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger) + d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger) require.NoError(t, err) queryMetrics := stats.NewQueryMetrics(nil) diff --git a/pkg/streamingpromql/benchmarks/ingester.go b/pkg/streamingpromql/benchmarks/ingester.go index 6f3b5f04a9a..9107b66f64f 100644 --- a/pkg/streamingpromql/benchmarks/ingester.go +++ b/pkg/streamingpromql/benchmarks/ingester.go @@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) (*ingester.Ingester, string, fun return services.StopAndAwaitTerminated(context.Background(), ingestersRing) }) - ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger()) + ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger()) if err != nil { cleanup() return nil, "", nil, fmt.Errorf("could not create ingester: %w", err) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index b12fc465fd0..9fc26f99b71 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -62,6 +62,8 @@ const ( QueryIngestersWithinFlag = "querier.query-ingesters-within" AlertmanagerMaxGrafanaConfigSizeFlag = "alertmanager.max-grafana-config-size-bytes" AlertmanagerMaxGrafanaStateSizeFlag = "alertmanager.max-grafana-state-size-bytes" + costAttributionLabelsFlag = "validation.cost-attribution-labels" + maxCostAttributionLabelsPerUserFlag = "validation.max-cost-attribution-labels-per-user" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -70,6 +72,7 @@ const ( var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or equal to 1") + errCostAttributionLabelsLimitExceeded = errors.New("invalid value for -" + costAttributionLabelsFlag + ": exceeds the limit defined by -" + maxCostAttributionLabelsPerUserFlag) ) // LimitError is a marker interface for the errors that do not comply with the specified limits. 
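The SoftAppendErrorProcessor change above widens the per-user series-limit callback so it receives the offending series' labels, allowing that rejection to be attributed like the other soft errors. A minimal sketch of the dispatch, with local stand-in types instead of the real mimirpb and globalerror packages:

    package main

    import (
        "errors"
        "fmt"
    )

    var errMaxSeriesPerUser = errors.New("per-user series limit reached")

    type labelAdapter struct{ Name, Value string }

    type softErrors struct {
        maxSeriesPerUser func(labels []labelAdapter) // previously func()
    }

    func (e *softErrors) processErr(err error, labels []labelAdapter) bool {
        if errors.Is(err, errMaxSeriesPerUser) {
            e.maxSeriesPerUser(labels) // labels now available for cost attribution
            return true
        }
        return false
    }

    func main() {
        p := &softErrors{maxSeriesPerUser: func(lbls []labelAdapter) {
            fmt.Println("attribute discarded sample to", lbls)
        }}
        p.processErr(errMaxSeriesPerUser, []labelAdapter{{Name: "team", Value: "frontend"}})
    }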
@@ -187,6 +190,12 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. + CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionLabelsPerUser int `yaml:"max_cost_attribution_labels_per_user" json:"max_cost_attribution_labels_per_user" category:"experimental"` + MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` + CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` + // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` @@ -300,6 +309,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.") + f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.") + f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. 
Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") @@ -476,6 +489,10 @@ func (l *Limits) validate() error { return errInvalidIngestStorageReadConsistency } + if len(l.CostAttributionLabels) > l.MaxCostAttributionLabelsPerUser { + return errCostAttributionLabelsLimitExceeded + } + return nil } @@ -797,6 +814,22 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } +func (o *Overrides) CostAttributionLabels(userID string) []string { + return o.getOverridesForUser(userID).CostAttributionLabels +} + +func (o *Overrides) MaxCostAttributionLabelsPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionLabelsPerUser +} + +func (o *Overrides) CostAttributionCooldown(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).CostAttributionCooldown) +} + +func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser +} + // IngestionTenantShardSize returns the ingesters shard size for a given user. func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 9dc82df2d05..c56cb1ab026 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1076,6 +1076,12 @@ metric_relabel_configs: cfg: `ingest_storage_read_consistency: xyz`, expectedErr: errInvalidIngestStorageReadConsistency.Error(), }, + "should fail when cost_attribution_labels exceed max_cost_attribution_labels_per_user": { + cfg: ` +cost_attribution_labels: label1, label2, label3, +max_cost_attribution_labels_per_user: 2`, + expectedErr: errCostAttributionLabelsLimitExceeded.Error(), + }, } for testName, testData := range tests {
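The validation rule exercised by the new test case above: configuring more cost_attribution_labels than max_cost_attribution_labels_per_user allows is rejected when the limits are validated. A self-contained sketch of the same check, with simplified local types rather than the real Limits struct:

    package main

    import (
        "errors"
        "fmt"
    )

    var errCostAttributionLabelsLimitExceeded = errors.New(
        "invalid value for -validation.cost-attribution-labels: exceeds the limit defined by -validation.max-cost-attribution-labels-per-user")

    type limits struct {
        costAttributionLabels           []string
        maxCostAttributionLabelsPerUser int
    }

    func (l *limits) validate() error {
        if len(l.costAttributionLabels) > l.maxCostAttributionLabelsPerUser {
            return errCostAttributionLabelsLimitExceeded
        }
        return nil
    }

    func main() {
        l := &limits{
            costAttributionLabels:           []string{"label1", "label2", "label3"},
            maxCostAttributionLabelsPerUser: 2,
        }
        fmt.Println(l.validate()) // rejected: three labels against a limit of two
    }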