Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Feb 19, 2025
1 parent 1f9bb3e commit 727a08c
Show file tree
Hide file tree
Showing 2 changed files with 301 additions and 13 deletions.
64 changes: 51 additions & 13 deletions internal/checks/promql_series.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type PromqlSeriesSettings struct {
LookbackRange string `hcl:"lookbackRange,optional" json:"lookbackRange,omitempty"`
LookbackStep string `hcl:"lookbackStep,optional" json:"lookbackStep,omitempty"`
IgnoreMetrics []string `hcl:"ignoreMetrics,optional" json:"ignoreMetrics,omitempty"`
IgnoreMatching []string `hcl:"ignoreMatching,optional" json:"ignoreMatching,omitempty"`
FallbackTimeout string `hcl:"fallbackTimeout,optional" json:"fallbackTimeout,omitempty"`
ignoreMetricsRe []*regexp.Regexp
lookbackRangeDuration time.Duration
Expand Down Expand Up @@ -76,6 +77,12 @@ func (c *PromqlSeriesSettings) Validate() error {
}
}

for _, selector := range c.IgnoreMatching {
if _, err := promParser.ParseMetricSelector(selector); err != nil {
return fmt.Errorf("%q is not a valid PromQL metric selector: %w", selector, err)
}
}

return nil
}

Expand Down Expand Up @@ -310,13 +317,15 @@ func (c SeriesCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Ru
),
Bug,
)
problems = append(problems, Problem{
Lines: expr.Value.Lines,
Reporter: c.Reporter(),
Text: text,
Details: c.checkOtherServer(ctx, selector.String(), settings.fallbackTimeout),
Severity: severity,
})
if details, shouldReport := c.checkOtherServer(ctx, selector.String(), settings); shouldReport {
problems = append(problems, Problem{
Lines: expr.Value.Lines,
Reporter: c.Reporter(),
Text: text,
Details: details,
Severity: severity,
})
}
slog.Debug("No historical series for base metric", slog.String("check", c.Reporter()), slog.String("selector", (&bareSelector).String()))
continue
}
Expand Down Expand Up @@ -578,7 +587,7 @@ func (c SeriesCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Ru
return problems
}

func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout time.Duration) string {
func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, settings *PromqlSeriesSettings) (string, bool) {
var servers []*promapi.FailoverGroup
if val := ctx.Value(promapi.AllPrometheusServers); val != nil {
for _, s := range val.([]*promapi.FailoverGroup) {
Expand All @@ -590,7 +599,7 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout
}

if len(servers) == 0 {
return SeriesCheckCommonProblemDetails
return SeriesCheckCommonProblemDetails, true
}

var suffix string
Expand All @@ -602,13 +611,13 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout
start := time.Now()
var tested, matches, skipped int
for _, prom := range servers {
if time.Since(start) >= timeout {
if time.Since(start) >= settings.fallbackTimeout {
slog.Debug("Time limit reached for checking if metric exists on any other Prometheus server",
slog.String("check", c.Reporter()),
slog.String("selector", query),
)
suffix = fmt.Sprintf("\npint tried to check %d server(s) but stopped after checking %d server(s) due to reaching time limit (%s).\n",
len(servers), tested, output.HumanizeDuration(timeout))
len(servers), tested, output.HumanizeDuration(settings.fallbackTimeout))
break
}

Expand All @@ -632,6 +641,13 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout
uri := prom.URI()

if series > 0 {
for _, selector := range settings.IgnoreMatching {
m, _ := promParser.ParseMetricSelector(selector)
if c.hasSeriesWithSelector(ctx, prom, query, m) {
return "", false
}
}

matches++
if matches > 10 {
skipped++
Expand All @@ -656,10 +672,32 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout
buf.WriteString("\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n")

if matches > 0 {
return buf.String()
return buf.String(), true
}

return SeriesCheckCommonProblemDetails, true
}

func (c SeriesCheck) hasSeriesWithSelector(ctx context.Context, prom *promapi.FailoverGroup, query string, matchers []*labels.Matcher) bool {
qr, err := prom.Query(ctx, query)
if err != nil {
return false
}

return SeriesCheckCommonProblemDetails
for _, s := range qr.Series {
var seriesMatches bool
for _, m := range matchers {
s.Labels.Range(func(l labels.Label) {
if m.Name == l.Name && m.Matches(l.Value) {
seriesMatches = true
}
})
}
if seriesMatches {
return true
}
}
return false
}

func (c SeriesCheck) queryProblem(err error, expr parser.PromQLExpr) Problem {
Expand Down
250 changes: 250 additions & 0 deletions internal/checks/promql_series_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4347,6 +4347,256 @@ func TestSeriesCheck(t *testing.T) {
},
},
},
{
description: "metric missing but found on others / ignoreMatching match",
content: `
- record: foo
expr: sum(foo)
`,
checker: newSeriesCheck,
ctx: func(ctx context.Context, _ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMatching: []string{`{job="bar"}`},
}
if err := s.Validate(); err != nil {
t.Error(err)
t.FailNow()
}
return context.WithValue(ctx, checks.SettingsKey(checks.SeriesCheckName), &s)
},
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 30 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
prometheus: newSimpleProm,
problems: noProblems,
mocks: []*prometheusMock{
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithSingleInstantVector(),
},
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `foo`},
},
resp: vectorResponse{
samples: []*model.Sample{
generateSample(map[string]string{
"instance": "instance1",
"job": "foo",
}),
generateSample(map[string]string{
"instance": "instance2",
"job": "foo",
}),
generateSample(map[string]string{
"instance": "instance3",
"job": "bar",
}),
},
},
},
{
conds: []requestCondition{
requireQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyMatrix(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: "count(up)"},
},
resp: respondWithSingleRangeVector1W(),
},
},
},
{
description: "metric missing but found on others / ignoreMatching mismatch",
content: `
- record: foo
expr: sum(foo)
`,
checker: newSeriesCheck,
ctx: func(ctx context.Context, _ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMatching: []string{`{job="bar"}`},
}
if err := s.Validate(); err != nil {
t.Error(err)
t.FailNow()
}
return context.WithValue(ctx, checks.SettingsKey(checks.SeriesCheckName), &s)
},
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 30 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 3,
Last: 3,
},
Reporter: checks.SeriesCheckName,
Text: noMetricText("prom", uri, "foo", "1w"),
Details: fmt.Sprintf("`foo` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=foo)\n- [prom1](%s/other/graph?g0.expr=foo)\n- [prom2](%s/other/graph?g0.expr=foo)\n- [prom3](%s/other/graph?g0.expr=foo)\n- [prom4](%s/other/graph?g0.expr=foo)\n- [prom5](%s/other/graph?g0.expr=foo)\n- [prom6](%s/other/graph?g0.expr=foo)\n- [prom7](%s/other/graph?g0.expr=foo)\n- [prom8](%s/other/graph?g0.expr=foo)\n- [prom9](%s/other/graph?g0.expr=foo)\n- and 20 other server(s).\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n",
uri, uri, uri, uri, uri, uri, uri, uri, uri, uri),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithSingleInstantVector(),
},
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `foo`},
},
resp: vectorResponse{
samples: []*model.Sample{
generateSample(map[string]string{
"instance": "instance1",
"job": "foo",
}),
generateSample(map[string]string{
"instance": "instance2",
"job": "foo",
}),
generateSample(map[string]string{
"instance": "instance3",
"job": "bar1",
}),
},
},
},
{
conds: []requestCondition{
requireQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyMatrix(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: "count(up)"},
},
resp: respondWithSingleRangeVector1W(),
},
},
},
{
description: "metric missing but found on others / ignoreMatching error",
content: `
- record: foo
expr: sum(foo)
`,
checker: newSeriesCheck,
ctx: func(ctx context.Context, _ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMatching: []string{`{job="bar"}`},
}
if err := s.Validate(); err != nil {
t.Error(err)
t.FailNow()
}
return context.WithValue(ctx, checks.SettingsKey(checks.SeriesCheckName), &s)
},
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 30 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 3,
Last: 3,
},
Reporter: checks.SeriesCheckName,
Text: noMetricText("prom", uri, "foo", "1w"),
Details: fmt.Sprintf("`foo` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=foo)\n- [prom1](%s/other/graph?g0.expr=foo)\n- [prom2](%s/other/graph?g0.expr=foo)\n- [prom3](%s/other/graph?g0.expr=foo)\n- [prom4](%s/other/graph?g0.expr=foo)\n- [prom5](%s/other/graph?g0.expr=foo)\n- [prom6](%s/other/graph?g0.expr=foo)\n- [prom7](%s/other/graph?g0.expr=foo)\n- [prom8](%s/other/graph?g0.expr=foo)\n- [prom9](%s/other/graph?g0.expr=foo)\n- and 20 other server(s).\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n",
uri, uri, uri, uri, uri, uri, uri, uri, uri, uri),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithSingleInstantVector(),
},
{
conds: []requestCondition{
requestPathCond{path: "/other" + promapi.APIPathQuery},
formCond{key: "query", value: `foo`},
},
resp: respondWithInternalError(),
},
{
conds: []requestCondition{
requireQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(foo)`},
},
resp: respondWithEmptyMatrix(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: "count(up)"},
},
resp: respondWithSingleRangeVector1W(),
},
},
},
}
runTests(t, testCases)
}

0 comments on commit 727a08c

Please sign in to comment.