From 0a1577644f249e09727271f8482ccedde4868c20 Mon Sep 17 00:00:00 2001
From: Todd Neal
Date: Tue, 11 Apr 2023 14:25:34 -0500
Subject: [PATCH] chore: add logging if deprovisioners take longer than expected (#288)

---
 pkg/controllers/deprovisioning/controller.go | 27 ++++++++++++++++++++
 pkg/controllers/provisioning/provisioner.go  |  1 +
 2 files changed, 28 insertions(+)

diff --git a/pkg/controllers/deprovisioning/controller.go b/pkg/controllers/deprovisioning/controller.go
index 54b72cbc4f..4618a209ca 100644
--- a/pkg/controllers/deprovisioning/controller.go
+++ b/pkg/controllers/deprovisioning/controller.go
@@ -53,6 +53,8 @@ type Controller struct {
 	clock          clock.Clock
 	cloudProvider  cloudprovider.CloudProvider
 	deprovisioners []Deprovisioner
+	mu             sync.Mutex
+	lastRun        map[string]time.Time
 }
 
 // pollingPeriod that we inspect cluster to look for opportunities to deprovision
@@ -81,6 +83,7 @@ func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provi
 		provisioner:   provisioner,
 		recorder:      recorder,
 		cloudProvider: cp,
+		lastRun:       map[string]time.Time{},
 		deprovisioners: []Deprovisioner{
 			// Expire any machines that must be deleted, allowing their pods to potentially land on currently
 			NewExpiration(clk, kubeClient, cluster, provisioner, recorder),
@@ -107,15 +110,22 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Bu
 }
 
 func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
+	// this won't catch if the reconcile loop hangs forever but it will catch other issues
+	c.logAbnormalRuns(ctx)
+	defer c.logAbnormalRuns(ctx)
+	c.recordRun("deprovisioning-loop")
+
 	// We need to ensure that our internal cluster state mechanism is synced before we proceed
 	// with making any scheduling decision off of our state nodes. Otherwise, we have the potential to make
 	// a scheduling decision based on a smaller subset of nodes in our cluster state than actually exist.
 	if !c.cluster.Synced(ctx) {
+		logging.FromContext(ctx).Debugf("waiting on cluster sync")
 		return reconcile.Result{RequeueAfter: time.Second}, nil
 	}
 	// Attempt different deprovisioning methods. We'll only let one method perform an action
 	isConsolidated := c.cluster.Consolidated()
 	for _, d := range c.deprovisioners {
+		c.recordRun(fmt.Sprintf("%T", d))
 		success, err := c.deprovision(ctx, d)
 		if err != nil {
 			return reconcile.Result{}, fmt.Errorf("deprovisioning via %q, %w", d, err)
@@ -313,3 +323,20 @@ func (c *Controller) setNodesUnschedulable(ctx context.Context, isUnschedulable
 	}
 	return multiErr
 }
+
+func (c *Controller) recordRun(s string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.lastRun[s] = c.clock.Now()
+}
+
+func (c *Controller) logAbnormalRuns(ctx context.Context) {
+	const AbnormalTimeLimit = 15 * time.Minute
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	for name, runTime := range c.lastRun {
+		if timeSince := c.clock.Since(runTime); timeSince > AbnormalTimeLimit {
+			logging.FromContext(ctx).Debugf("abnormal time between runs of %s = %s", name, timeSince)
+		}
+	}
+}
diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go
index 772feb626d..08fa22c4ba 100644
--- a/pkg/controllers/provisioning/provisioner.go
+++ b/pkg/controllers/provisioning/provisioner.go
@@ -119,6 +119,7 @@ func (p *Provisioner) Reconcile(ctx context.Context, _ reconcile.Request) (resul
 	// We need to ensure that our internal cluster state mechanism is synced before we proceed
 	// with making any scheduling decision off of our state nodes. Otherwise, we have the potential to make
 	// a scheduling decision based on a smaller subset of nodes in our cluster state than actually exist.
 	if !p.cluster.Synced(ctx) {
+		logging.FromContext(ctx).Debugf("waiting on cluster sync")
 		return reconcile.Result{RequeueAfter: time.Second}, nil
 	}