Skip to content

Commit

Permalink
fix(healthcheck): correctly label nodes removed from metrics (#4178)
Browse files Browse the repository at this point in the history
Commits a75df89 and 3cdd20d added and started setting
new labels (dc/rack) to the healthcheck metrics.
Unfortunately, those labels weren't taken into consideration
when removing labels of removed clusters/nodes.
This resulted in not removing those metrics until SM restart.

This commit fixes this issue and brings a small refactor
to the way in which healthcheck labels are applied,
so that it's more difficult to make such mistake in the future.

Fixes #4017
  • Loading branch information
Michal-Leszczynski authored Dec 18, 2024
1 parent cafa851 commit a16e511
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 45 deletions.
66 changes: 47 additions & 19 deletions pkg/service/healthcheck/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,85 @@ import (
)

const (
clusterKey = "cluster"
hostKey = "host"
dcKey = "dc"
pingTypeKey = "ping_type"
rackKey = "rack"
clusterKey = "cluster"
dcKey = "dc"
rackKey = "rack"
hostKey = "host"

metricBufferSize = 100
)

func labelNames() []string {
return []string{clusterKey, dcKey, rackKey, hostKey}
}

type labels struct {
cluster string
dc string
rack string
host string
}

func newLabels(cluster, dc, rack, host string) labels {
return labels{
cluster: cluster,
dc: dc,
rack: rack,
host: host,
}
}

func (ls labels) promLabels() prometheus.Labels {
return prometheus.Labels{
clusterKey: ls.cluster,
dcKey: ls.dc,
rackKey: ls.rack,
hostKey: ls.host,
}
}

var (
cqlStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "cql_status",
Help: "Host native port status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())

cqlRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "cql_rtt_ms",
Help: "Host native port RTT.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())

restStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "rest_status",
Help: "Host REST status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())

restRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "rest_rtt_ms",
Help: "Host REST RTT.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())

alternatorStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "alternator_status",
Help: "Host Alternator status. -2 stands for unavailable agent, -1 for unavailable Scylla and 1 for everything is fine.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())

alternatorRTT = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "scylla_manager",
Subsystem: "healthcheck",
Name: "alternator_rtt_ms",
Help: "Host Alternator RTT.",
}, []string{clusterKey, hostKey, dcKey, rackKey})
}, labelNames())
)

func init() {
Expand All @@ -72,29 +100,29 @@ func init() {
)
}

func apply(metrics []prometheus.Metric, f func(cluster, dc, host, pt string, v float64)) {
func apply(metrics []prometheus.Metric, f func(ls labels, v float64)) {
for _, m := range metrics {
metric := &dto.Metric{}
if err := m.Write(metric); err != nil {
continue
}
var c, dc, h, pt string

var c, dc, r, h string
for _, l := range metric.GetLabel() {
if l.GetName() == clusterKey {
c = l.GetValue()
}
if l.GetName() == hostKey {
h = l.GetValue()
}
if l.GetName() == dcKey {
dc = l.GetValue()
}
if l.GetName() == pingTypeKey {
pt = l.GetValue()
if l.GetName() == rackKey {
r = l.GetValue()
}
if l.GetName() == hostKey {
h = l.GetValue()
}
}
f(c, dc, h, pt, metric.GetGauge().GetValue())
f(newLabels(c, dc, r, h), metric.GetGauge().GetValue())
}
}

Expand Down
41 changes: 15 additions & 26 deletions pkg/service/healthcheck/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,13 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, addresses [
if err == nil {
rtt, err = r.ping(ctx, clusterID, addresses[i], r.timeout, ni)
}
hl := prometheus.Labels{
clusterKey: clusterID.String(),
hostKey: addresses[i],
rackKey: ni.Rack,
dcKey: ni.Datacenter,
}
promLs := newLabels(clusterID.String(), ni.Datacenter, ni.Rack, addresses[i]).promLabels()
if err != nil {
r.metrics.status.With(hl).Set(-1)
r.metrics.status.With(promLs).Set(-1)
} else {
r.metrics.status.With(hl).Set(1)
r.metrics.status.With(promLs).Set(1)
}
r.metrics.rtt.With(hl).Set(float64(rtt.Milliseconds()))
r.metrics.rtt.With(promLs).Set(float64(rtt.Milliseconds()))

return err
}
Expand All @@ -108,34 +103,28 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, addresses [
}

func (r runner) removeMetricsForCluster(clusterID uuid.UUID) {
apply(collect(r.metrics.status), func(cluster, _, host, _ string, _ float64) {
if clusterID.String() != cluster {
apply(collect(r.metrics.status), func(ls labels, _ float64) {
if clusterID.String() != ls.cluster {
return
}

hl := prometheus.Labels{
clusterKey: clusterID.String(),
hostKey: host,
}
r.metrics.status.Delete(hl)
r.metrics.rtt.Delete(hl)
promLs := ls.promLabels()
r.metrics.status.Delete(promLs)
r.metrics.rtt.Delete(promLs)
})
}

func (r runner) removeMetricsForMissingHosts(clusterID uuid.UUID, addresses []string) {
apply(collect(r.metrics.status), func(cluster, _, host, _ string, _ float64) {
if clusterID.String() != cluster {
apply(collect(r.metrics.status), func(ls labels, _ float64) {
if clusterID.String() != ls.cluster {
return
}
if slice.ContainsString(addresses, host) {
if slice.ContainsString(addresses, ls.host) {
return
}

l := prometheus.Labels{
clusterKey: clusterID.String(),
hostKey: host,
}
r.metrics.status.Delete(l)
r.metrics.rtt.Delete(l)
promLs := ls.promLabels()
r.metrics.status.Delete(promLs)
r.metrics.rtt.Delete(promLs)
})
}

0 comments on commit a16e511

Please sign in to comment.