From a16e511f5ca2f7e3216c51b5d8c7d77286ff472b Mon Sep 17 00:00:00 2001 From: Michal-Leszczynski <74614433+Michal-Leszczynski@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:38:22 +0100 Subject: [PATCH] fix(healthcheck): correctly label nodes removed from metrics (#4178) Commits a75df89c and 3cdd20dc added and started setting new labels (dc/rack) to the healthcheck metrics. Unfortunately, those labels weren't taken into consideration when removing labels of removed clusters/nodes. This resulted in not removing those metrics until SM restart. This commit fixes this issue and brings a small refactor to the way in which healthcheck labels are applied, so that it's more difficult to make such mistake in the future. Fixes #4017 --- pkg/service/healthcheck/metrics.go | 66 +++++++++++++++++++++--------- pkg/service/healthcheck/runner.go | 41 +++++++------------ 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/pkg/service/healthcheck/metrics.go b/pkg/service/healthcheck/metrics.go index aa7f6d497..0c3184905 100644 --- a/pkg/service/healthcheck/metrics.go +++ b/pkg/service/healthcheck/metrics.go @@ -8,57 +8,85 @@ import ( ) const ( - clusterKey = "cluster" - hostKey = "host" - dcKey = "dc" - pingTypeKey = "ping_type" - rackKey = "rack" + clusterKey = "cluster" + dcKey = "dc" + rackKey = "rack" + hostKey = "host" metricBufferSize = 100 ) +func labelNames() []string { + return []string{clusterKey, dcKey, rackKey, hostKey} +} + +type labels struct { + cluster string + dc string + rack string + host string +} + +func newLabels(cluster, dc, rack, host string) labels { + return labels{ + cluster: cluster, + dc: dc, + rack: rack, + host: host, + } +} + +func (ls labels) promLabels() prometheus.Labels { + return prometheus.Labels{ + clusterKey: ls.cluster, + dcKey: ls.dc, + rackKey: ls.rack, + hostKey: ls.host, + } +} + var ( cqlStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "cql_status", Help: "Host native port status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) cqlRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "cql_rtt_ms", Help: "Host native port RTT.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) restStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "rest_status", Help: "Host REST status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) restRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "rest_rtt_ms", Help: "Host REST RTT.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) alternatorStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "alternator_status", Help: "Host Alternator status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) alternatorRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "scylla_manager", Subsystem: "healthcheck", Name: "alternator_rtt_ms", Help: "Host Alternator RTT.", - }, []string{clusterKey, hostKey, dcKey, rackKey}) + }, labelNames()) ) func init() { @@ -72,29 +100,29 @@ func init() { ) } -func apply(metrics []prometheus.Metric, f func(cluster, dc, host, pt string, v float64)) { +func apply(metrics []prometheus.Metric, f func(ls labels, v float64)) { for _, m := range metrics { metric := &dto.Metric{} if err := m.Write(metric); err != nil { continue } - var c, dc, h, pt string + var c, dc, r, h string for _, l := range metric.GetLabel() { if l.GetName() == clusterKey { c = l.GetValue() } - if l.GetName() == hostKey { - h = l.GetValue() - } if l.GetName() == dcKey { dc = l.GetValue() } - if l.GetName() == pingTypeKey { - pt = l.GetValue() + if l.GetName() == rackKey { + r = l.GetValue() + } + if l.GetName() == hostKey { + h = l.GetValue() } } - f(c, dc, h, pt, metric.GetGauge().GetValue()) + f(newLabels(c, dc, r, h), metric.GetGauge().GetValue()) } } diff --git a/pkg/service/healthcheck/runner.go b/pkg/service/healthcheck/runner.go index 71ed7faa6..a8e41a743 100644 --- a/pkg/service/healthcheck/runner.go +++ b/pkg/service/healthcheck/runner.go @@ -86,18 +86,13 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, addresses [ if err == nil { rtt, err = r.ping(ctx, clusterID, addresses[i], r.timeout, ni) } - hl := prometheus.Labels{ - clusterKey: clusterID.String(), - hostKey: addresses[i], - rackKey: ni.Rack, - dcKey: ni.Datacenter, - } + promLs := newLabels(clusterID.String(), ni.Datacenter, ni.Rack, addresses[i]).promLabels() if err != nil { - r.metrics.status.With(hl).Set(-1) + r.metrics.status.With(promLs).Set(-1) } else { - r.metrics.status.With(hl).Set(1) + r.metrics.status.With(promLs).Set(1) } - r.metrics.rtt.With(hl).Set(float64(rtt.Milliseconds())) + r.metrics.rtt.With(promLs).Set(float64(rtt.Milliseconds())) return err } @@ -108,34 +103,28 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, addresses [ } func (r runner) removeMetricsForCluster(clusterID uuid.UUID) { - apply(collect(r.metrics.status), func(cluster, _, host, _ string, _ float64) { - if clusterID.String() != cluster { + apply(collect(r.metrics.status), func(ls labels, _ float64) { + if clusterID.String() != ls.cluster { return } - hl := prometheus.Labels{ - clusterKey: clusterID.String(), - hostKey: host, - } - r.metrics.status.Delete(hl) - r.metrics.rtt.Delete(hl) + promLs := ls.promLabels() + r.metrics.status.Delete(promLs) + r.metrics.rtt.Delete(promLs) }) } func (r runner) removeMetricsForMissingHosts(clusterID uuid.UUID, addresses []string) { - apply(collect(r.metrics.status), func(cluster, _, host, _ string, _ float64) { - if clusterID.String() != cluster { + apply(collect(r.metrics.status), func(ls labels, _ float64) { + if clusterID.String() != ls.cluster { return } - if slice.ContainsString(addresses, host) { + if slice.ContainsString(addresses, ls.host) { return } - l := prometheus.Labels{ - clusterKey: clusterID.String(), - hostKey: host, - } - r.metrics.status.Delete(l) - r.metrics.rtt.Delete(l) + promLs := ls.promLabels() + r.metrics.status.Delete(promLs) + r.metrics.rtt.Delete(promLs) }) }