diff --git a/cmd/telefonistka/server.go b/cmd/telefonistka/server.go index 076b28f..0477a42 100644 --- a/cmd/telefonistka/server.go +++ b/cmd/telefonistka/server.go @@ -57,6 +57,8 @@ func serve() { mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) + go githubapi.MainGhMetricsLoop(mainGhClientCache) + mux := http.NewServeMux() mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache)) mux.Handle("/metrics", promhttp.Handler()) diff --git a/docs/observability.md b/docs/observability.md index 03ed81d..9304b6a 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -6,6 +6,13 @@ |telefonistka_github_github_rest_api_client_rate_remaining|gauge|The number of remaining requests the client can make this hour|| |telefonistka_github_github_rest_api_client_rate_limit|gauge|The number of requests per hour the client is currently limited to|| |telefonistka_webhook_server_webhook_hits_total|counter|The total number of validated webhook hits|`parsing`| +|telefonistka_github_open_prs|gauge|The number of open PRs|`repo_slug`| +|telefonistka_github_open_promotion_prs|gauge|The number of open promotion PRs|`repo_slug`| +|telefonistka_github_open_prs_with_pending_telefonistka_checks|gauge|The number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)|`repo_slug`| +|telefonistka_github_commit_status_updates_total|counter|The total number of commit status updates, and their status (success/pending/failure)|`repo_slug`, `status`| + +> [!NOTE] +> telefonistka_github_*_prs metrics are only supported on installtions that uses GitHub App authentication as it provides an easy way to query the relevant GH repos. Example metrics snippet: @@ -26,4 +33,20 @@ telefonistka_github_github_rest_api_client_rate_remaining 99668 # HELP telefonistka_webhook_server_webhook_hits_total The total number of validated webhook hits # TYPE telefonistka_webhook_server_webhook_hits_total counter telefonistka_webhook_server_webhook_hits_total{parsing="successful"} 8 +# HELP telefonistka_github_commit_status_updates_total The total number of commit status updates, and their status (success/pending/failure) +# TYPE telefonistka_github_commit_status_updates_total counter +telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="error"} 1 +telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="pending"} 1 +# HELP telefonistka_github_open_promotion_prs The total number of open PRs with promotion label +# TYPE telefonistka_github_open_promotion_prs gauge +telefonistka_github_open_promotion_prs{repo_slug="foo/bar1"} 0 +telefonistka_github_open_promotion_prs{repo_slug="foo/bar2"} 10 +# HELP telefonistka_github_open_prs The total number of open PRs +# TYPE telefonistka_github_open_prs gauge +telefonistka_github_open_prs{repo_slug="foo/bar1"} 0 +telefonistka_github_open_prs{repo_slug="foo/bar2"} 21 +# HELP telefonistka_github_open_prs_with_pending_telefonistka_checks The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits) +# TYPE telefonistka_github_open_prs_with_pending_telefonistka_checks gauge +telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar1"} 0 +telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar2"} 0 ``` diff --git a/internal/pkg/githubapi/github.go b/internal/pkg/githubapi/github.go index 5f128c9..96bc94a 100644 --- a/internal/pkg/githubapi/github.go +++ b/internal/pkg/githubapi/github.go @@ -881,6 +881,8 @@ func SetCommitStatus(ghPrClientDetails GhPrClientDetails, state string) { _, resp, err := ghPrClientDetails.GhClientPair.v3Client.Repositories.CreateStatus(ctx, ghPrClientDetails.Owner, ghPrClientDetails.Repo, ghPrClientDetails.PrSHA, commitStatus) prom.InstrumentGhCall(resp) + repoSlug := ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo + prom.IncCommitStatusUpdateCounter(repoSlug, state) if err != nil { ghPrClientDetails.PrLogger.Errorf("Failed to set commit status: err=%s\n%v", err, resp) } diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go new file mode 100644 index 0000000..4b91952 --- /dev/null +++ b/internal/pkg/githubapi/pr_metrics.go @@ -0,0 +1,108 @@ +package githubapi + +import ( + "context" + "time" + + "github.com/google/go-github/v62/github" + lru "github.com/hashicorp/golang-lru/v2" + log "github.com/sirupsen/logrus" + prom "github.com/wayfair-incubator/telefonistka/internal/pkg/prometheus" +) + +const ( + timeToDefineStale = 20 * time.Minute + metricRefreshTime = 60 * time.Second +) + +func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { + for t := range time.Tick(metricRefreshTime) { + log.Debugf("Updating pr metrics at %v", t) + getPrMetrics(mainGhClientCache) + } +} + +func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (pc prom.PrCounters, err error) { + log.Debugf("Checking repo %s", repo.GetName()) + ghOwner := repo.GetOwner().GetLogin() + prListOpts := &github.PullRequestListOptions{ + State: "open", + } + prs := []*github.PullRequest{} + + // paginate through PRs, there might be lots of them. + for { + perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting PRs for %s/%s: %v", ghOwner, repo.GetName(), err) + } + prs = append(prs, perPagePrs...) + if resp.NextPage == 0 { + break + } + prListOpts.Page = resp.NextPage + } + + for _, pr := range prs { + if DoesPrHasLabel(pr.Labels, "promotion") { + pc.OpenPromotionPrs++ + } + + log.Debugf("Checking PR %d", pr.GetNumber()) + commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err) + continue + } + if isPrStalePending(commitStatuses, timeToDefineStale) { + pc.PrWithStaleChecks++ + } + } + pc.OpenPrs = len(prs) + + return +} + +// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than timeToDefineStale and is in pending state +func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale time.Duration) bool { + for _, status := range commitStatuses.Statuses { + if *status.Context == "telefonistka" && + *status.State == "pending" && + status.UpdatedAt.GetTime().Before(time.Now().Add(timeToDefineStale*-1)) { + log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State) + return true + } else { + log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State) + } + } + + return false +} + +// getPrMetrics iterates through all clients , gets all repos and then all PRs and calculates metrics +// getPrMetrics assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call +// When using personal access token authentication, Telefonistka is unaware of the "relevant" repos (at least it get a webhook from them). +func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + for _, ghOwner := range mainGhClientCache.Keys() { + log.Debugf("Checking gh Owner %s", ghOwner) + ghClient, _ := mainGhClientCache.Get(ghOwner) + repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting repos for %s: %v", ghOwner, err) + continue + } + for _, repo := range repos.Repositories { + pc, err := getRepoPrMetrics(ctx, ghClient, repo) + if err != nil { + log.Errorf("error getting repos for %s: %v", ghOwner, err) + continue + } + prom.PublishPrMetrics(pc, repo.GetFullName()) + } + } +} diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go new file mode 100644 index 0000000..f1a90d8 --- /dev/null +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -0,0 +1,89 @@ +package githubapi + +import ( + "testing" + "time" + + "github.com/google/go-github/v62/github" +) + +func TestIsPrStalePending(t *testing.T) { + t.Parallel() + timeToDefineStale := 15 * time.Minute + + currentTime := time.Now() + tests := map[string]struct { + input github.CombinedStatus + result bool + }{ + "All Success": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("success"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + { + State: github.String("success"), + Context: github.String("circleci"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + { + State: github.String("success"), + Context: github.String("foobar"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + }, + }, + result: false, + }, + "Pending but not stale": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("pending"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-1 * time.Minute), + }, + }, + }, + }, + result: false, + }, + + "Pending and stale": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("pending"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-20 * time.Minute), + }, + }, + }, + }, + result: true, + }, + } + + for name, tc := range tests { + name := name + tc := tc + t.Run(name, func(t *testing.T) { + t.Parallel() + result := isPrStalePending(&tc.input, timeToDefineStale) + if result != tc.result { + t.Errorf("(%s)Expected %v, got %v", name, tc.result, result) + } + }) + } +} diff --git a/internal/pkg/prometheus/prometheus.go b/internal/pkg/prometheus/prometheus.go index 6056d49..be3e4ab 100644 --- a/internal/pkg/prometheus/prometheus.go +++ b/internal/pkg/prometheus/prometheus.go @@ -10,6 +10,12 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" ) +type PrCounters struct { + OpenPrs int + OpenPromotionPrs int + PrWithStaleChecks int +} + var ( webhookHitsVec = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "webhook_hits_total", @@ -39,6 +45,34 @@ var ( Subsystem: "github", }, []string{"api_group", "api_path", "repo_slug", "status", "method"}) + ghOpenPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_prs", + Help: "The total number of open PRs", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + ghOpenPromotionPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_promotion_prs", + Help: "The total number of open PRs with promotion label", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + ghOpenPrsWithPendingCheckGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_prs_with_pending_telefonistka_checks", + Help: "The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + commitStatusUpdates = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "commit_status_updates_total", + Help: "The total number of commit status updates, and their status (success/pending/failure)", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug", "status"}) + whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "upstream_requests_total", Help: "The total number of requests forwarded upstream servers", @@ -47,6 +81,22 @@ var ( }, []string{"status", "method", "url"}) ) +func IncCommitStatusUpdateCounter(repoSlug string, status string) { + commitStatusUpdates.With(prometheus.Labels{ + "repo_slug": repoSlug, + "status": status, + }).Inc() +} + +func PublishPrMetrics(pc PrCounters, repoSlug string) { + metricLables := prometheus.Labels{ + "repo_slug": repoSlug, + } + ghOpenPrsGauge.With(metricLables).Set(float64(pc.OpenPrs)) + ghOpenPromotionPrsGauge.With(metricLables).Set(float64(pc.OpenPromotionPrs)) + ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(pc.PrWithStaleChecks)) +} + // This function instrument Webhook hits and parsing of their content func InstrumentWebhookHit(parsing_status string) { webhookHitsVec.With(prometheus.Labels{"parsing": parsing_status}).Inc()