Skip to content

Commit

Permalink
monitoring PR handling failures. (#42)
Browse files Browse the repository at this point in the history
Instrument  PR handling failures.

We track both explicit failures with pr_handle_failures_total
And cases where telefonistka commit status check  is left in "pending"
state (because Telefonistka exploded while handling that event )


Co-authored-by: Yazdan Mohammadi <[email protected]>
  • Loading branch information
Oded-B and yzdann authored Dec 13, 2024
1 parent a77da6a commit 8439b61
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cmd/telefonistka/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ func serve() {
mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)
prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)

go githubapi.MainGhMetricsLoop(mainGhClientCache)

mux := http.NewServeMux()
mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache))
mux.Handle("/metrics", promhttp.Handler())
Expand Down
23 changes: 23 additions & 0 deletions docs/observability.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
|telefonistka_github_github_rest_api_client_rate_remaining|gauge|The number of remaining requests the client can make this hour||
|telefonistka_github_github_rest_api_client_rate_limit|gauge|The number of requests per hour the client is currently limited to||
|telefonistka_webhook_server_webhook_hits_total|counter|The total number of validated webhook hits|`parsing`|
|telefonistka_github_open_prs|gauge|The number of open PRs|`repo_slug`|
|telefonistka_github_open_promotion_prs|gauge|The number of open promotion PRs|`repo_slug`|
|telefonistka_github_open_prs_with_pending_telefonistka_checks|gauge|The number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)|`repo_slug`|
|telefonistka_github_commit_status_updates_total|counter|The total number of commit status updates, and their status (success/pending/failure)|`repo_slug`, `status`|

> [!NOTE]
> telefonistka_github_*_prs metrics are only supported on installtions that uses GitHub App authentication as it provides an easy way to query the relevant GH repos.
Example metrics snippet:

Expand All @@ -26,4 +33,20 @@ telefonistka_github_github_rest_api_client_rate_remaining 99668
# HELP telefonistka_webhook_server_webhook_hits_total The total number of validated webhook hits
# TYPE telefonistka_webhook_server_webhook_hits_total counter
telefonistka_webhook_server_webhook_hits_total{parsing="successful"} 8
# HELP telefonistka_github_commit_status_updates_total The total number of commit status updates, and their status (success/pending/failure)
# TYPE telefonistka_github_commit_status_updates_total counter
telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="error"} 1
telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="pending"} 1
# HELP telefonistka_github_open_promotion_prs The total number of open PRs with promotion label
# TYPE telefonistka_github_open_promotion_prs gauge
telefonistka_github_open_promotion_prs{repo_slug="foo/bar1"} 0
telefonistka_github_open_promotion_prs{repo_slug="foo/bar2"} 10
# HELP telefonistka_github_open_prs The total number of open PRs
# TYPE telefonistka_github_open_prs gauge
telefonistka_github_open_prs{repo_slug="foo/bar1"} 0
telefonistka_github_open_prs{repo_slug="foo/bar2"} 21
# HELP telefonistka_github_open_prs_with_pending_telefonistka_checks The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)
# TYPE telefonistka_github_open_prs_with_pending_telefonistka_checks gauge
telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar1"} 0
telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar2"} 0
```
2 changes: 2 additions & 0 deletions internal/pkg/githubapi/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,8 @@ func SetCommitStatus(ghPrClientDetails GhPrClientDetails, state string) {

_, resp, err := ghPrClientDetails.GhClientPair.v3Client.Repositories.CreateStatus(ctx, ghPrClientDetails.Owner, ghPrClientDetails.Repo, ghPrClientDetails.PrSHA, commitStatus)
prom.InstrumentGhCall(resp)
repoSlug := ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo
prom.IncCommitStatusUpdateCounter(repoSlug, state)
if err != nil {
ghPrClientDetails.PrLogger.Errorf("Failed to set commit status: err=%s\n%v", err, resp)
}
Expand Down
108 changes: 108 additions & 0 deletions internal/pkg/githubapi/pr_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package githubapi

import (
"context"
"time"

"github.com/google/go-github/v62/github"
lru "github.com/hashicorp/golang-lru/v2"
log "github.com/sirupsen/logrus"
prom "github.com/wayfair-incubator/telefonistka/internal/pkg/prometheus"
)

const (
timeToDefineStale = 20 * time.Minute
metricRefreshTime = 60 * time.Second
)

func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) {
for t := range time.Tick(metricRefreshTime) {
log.Debugf("Updating pr metrics at %v", t)
getPrMetrics(mainGhClientCache)
}
}

func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (pc prom.PrCounters, err error) {
log.Debugf("Checking repo %s", repo.GetName())
ghOwner := repo.GetOwner().GetLogin()
prListOpts := &github.PullRequestListOptions{
State: "open",
}
prs := []*github.PullRequest{}

// paginate through PRs, there might be lots of them.
for {
perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting PRs for %s/%s: %v", ghOwner, repo.GetName(), err)
}
prs = append(prs, perPagePrs...)
if resp.NextPage == 0 {
break
}
prListOpts.Page = resp.NextPage
}

for _, pr := range prs {
if DoesPrHasLabel(pr.Labels, "promotion") {
pc.OpenPromotionPrs++
}

log.Debugf("Checking PR %d", pr.GetNumber())
commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err)
continue
}
if isPrStalePending(commitStatuses, timeToDefineStale) {
pc.PrWithStaleChecks++
}
}
pc.OpenPrs = len(prs)

return
}

// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than timeToDefineStale and is in pending state
func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale time.Duration) bool {
for _, status := range commitStatuses.Statuses {
if *status.Context == "telefonistka" &&
*status.State == "pending" &&
status.UpdatedAt.GetTime().Before(time.Now().Add(timeToDefineStale*-1)) {
log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State)
return true
} else {
log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State)
}
}

return false
}

// getPrMetrics iterates through all clients , gets all repos and then all PRs and calculates metrics
// getPrMetrics assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call
// When using personal access token authentication, Telefonistka is unaware of the "relevant" repos (at least it get a webhook from them).
func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
for _, ghOwner := range mainGhClientCache.Keys() {
log.Debugf("Checking gh Owner %s", ghOwner)
ghClient, _ := mainGhClientCache.Get(ghOwner)
repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting repos for %s: %v", ghOwner, err)
continue
}
for _, repo := range repos.Repositories {
pc, err := getRepoPrMetrics(ctx, ghClient, repo)
if err != nil {
log.Errorf("error getting repos for %s: %v", ghOwner, err)
continue
}
prom.PublishPrMetrics(pc, repo.GetFullName())
}
}
}
89 changes: 89 additions & 0 deletions internal/pkg/githubapi/pr_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package githubapi

import (
"testing"
"time"

"github.com/google/go-github/v62/github"
)

func TestIsPrStalePending(t *testing.T) {
t.Parallel()
timeToDefineStale := 15 * time.Minute

currentTime := time.Now()
tests := map[string]struct {
input github.CombinedStatus
result bool
}{
"All Success": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("success"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
{
State: github.String("success"),
Context: github.String("circleci"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
{
State: github.String("success"),
Context: github.String("foobar"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
},
},
result: false,
},
"Pending but not stale": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("pending"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-1 * time.Minute),
},
},
},
},
result: false,
},

"Pending and stale": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("pending"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-20 * time.Minute),
},
},
},
},
result: true,
},
}

for name, tc := range tests {
name := name
tc := tc
t.Run(name, func(t *testing.T) {
t.Parallel()
result := isPrStalePending(&tc.input, timeToDefineStale)
if result != tc.result {
t.Errorf("(%s)Expected %v, got %v", name, tc.result, result)
}
})
}
}
50 changes: 50 additions & 0 deletions internal/pkg/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ import (
"github.com/prometheus/client_golang/prometheus/promauto"
)

type PrCounters struct {
OpenPrs int
OpenPromotionPrs int
PrWithStaleChecks int
}

var (
webhookHitsVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "webhook_hits_total",
Expand Down Expand Up @@ -39,6 +45,34 @@ var (
Subsystem: "github",
}, []string{"api_group", "api_path", "repo_slug", "status", "method"})

ghOpenPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs",
Help: "The total number of open PRs",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPromotionPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_promotion_prs",
Help: "The total number of open PRs with promotion label",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPrsWithPendingCheckGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs_with_pending_telefonistka_checks",
Help: "The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

commitStatusUpdates = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "commit_status_updates_total",
Help: "The total number of commit status updates, and their status (success/pending/failure)",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug", "status"})

whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "upstream_requests_total",
Help: "The total number of requests forwarded upstream servers",
Expand All @@ -47,6 +81,22 @@ var (
}, []string{"status", "method", "url"})
)

func IncCommitStatusUpdateCounter(repoSlug string, status string) {
commitStatusUpdates.With(prometheus.Labels{
"repo_slug": repoSlug,
"status": status,
}).Inc()
}

func PublishPrMetrics(pc PrCounters, repoSlug string) {
metricLables := prometheus.Labels{
"repo_slug": repoSlug,
}
ghOpenPrsGauge.With(metricLables).Set(float64(pc.OpenPrs))
ghOpenPromotionPrsGauge.With(metricLables).Set(float64(pc.OpenPromotionPrs))
ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(pc.PrWithStaleChecks))
}

// This function instrument Webhook hits and parsing of their content
func InstrumentWebhookHit(parsing_status string) {
webhookHitsVec.With(prometheus.Labels{"parsing": parsing_status}).Inc()
Expand Down

0 comments on commit 8439b61

Please sign in to comment.