diff --git a/CHANGELOG.md b/CHANGELOG.md index 12d4f1f9502..301da4dbf95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Added +- [#8492](https://github.com/thanos-io/thanos/pull/8492) Query: Add `thanos_query_endpoints` metric to track healthy/unhealthy endpoints. ### Changed diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index bbf2612972a..25372d6cd2f 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -17,6 +17,7 @@ import ( "github.com/go-kit/log/level" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/prometheus/model/labels" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -218,9 +219,10 @@ type EndpointSet struct { updateMtx sync.Mutex - endpointsMtx sync.RWMutex - endpoints map[string]*endpointRef - endpointsMetric *endpointSetNodeCollector + endpointsMtx sync.RWMutex + endpoints map[string]*endpointRef + endpointsMetric *endpointSetNodeCollector + endpointsStatusCount *prometheus.GaugeVec // Track if the first update has completed firstUpdateOnce sync.Once @@ -271,6 +273,13 @@ func NewEndpointSet( }, endpoints: make(map[string]*endpointRef), firstUpdateChan: make(chan struct{}), + endpointsStatusCount: promauto.With(reg).NewGaugeVec( + prometheus.GaugeOpts{ + Name: "thanos_query_endpoints", + Help: "Number of endpoints connected to the querier categorized by healthy/unhealthy. 
Strict endpoints are never considered as unhealthy.", + }, + []string{"status"}, + ), } } @@ -395,6 +404,15 @@ func (e *EndpointSet) Update(ctx context.Context) { e.endpointsMetric.Update(stats) + activeCount := len(e.endpoints) + specsCount := len(e.endpointSpecs()) + inactiveCount := specsCount - activeCount + if inactiveCount < 0 { + inactiveCount = 0 + } + e.endpointsStatusCount.WithLabelValues("healthy").Set(float64(activeCount)) + e.endpointsStatusCount.WithLabelValues("unhealthy").Set(float64(inactiveCount)) + // Signal that the first update has completed e.firstUpdateOnce.Do(func() { close(e.firstUpdateChan)