From 9100887459c7a32883d3cd15a727f19a48aa4ca3 Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Tue, 27 Jun 2023 12:51:20 +0200
Subject: [PATCH 1/3] checkstate: make check runners exit cleanly

The checks manager uses contexts to cancel check runners. This happens
during checkstate.PlanChanged(), when the previous checkers are all
terminated and the new checker definition is applied. However, context
cancellation is asynchronous (as highlighted in the context
documentation), so the check runners may keep running even after the
context cancellation calls have returned. Due to this race condition, a
runner can still launch a check even after the context cancellation is
complete.

This problem is reproduced here:

https://gist.github.com/flotter/f6bf32d7cc33cb1d53a3ddc39fc3b64c

The result is easily seen in the following unit test failure:

panic: internal error: reaper must be started

goroutine 75 [running]:

github.com/canonical/pebble/internals/reaper.StartCommand
    /home/flotter/pebble/internals/reaper/reaper.go:164
github.com/canonical/pebble/internals/overlord/checkstate.(*execChecker).check()
    /home/flotter/pebble/internals/overlord/checkstate/checkers.go:170
github.com/canonical/pebble/internals/overlord/checkstate.(*checkData).runCheck()
    /home/flotter/pebble/internals/overlord/checkstate/manager.go:199
github.com/canonical/pebble/internals/overlord/checkstate.(*checkData).loop()
    /home/flotter/pebble/internals/overlord/checkstate/manager.go:182
created by github.com/canonical/pebble/internals/overlord/checkstate.(*CheckManager).PlanChanged
    /home/flotter/pebble/internals/overlord/checkstate/manager.go:74

In this particular case, the problem is that test teardown stops the
reaper process, which is responsible for reaping check commands, while
the final (edge case) run of the check is still completing.

Following the same logic, this could also cause the previous checks
being terminated to overlap with the new checks being started in
checkstate.PlanChanged().

The following changes fix the problem:

- Use a WaitGroup to represent the group of active checks in the check
  manager.
- Wait() on the group to complete before creating the next group of
  checks when PlanChanged() is called.
- Each check runner uses a deferred exit to decrement the group
  counter.
---
 internals/overlord/checkstate/manager.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/internals/overlord/checkstate/manager.go b/internals/overlord/checkstate/manager.go
index abbde8e7..af8db46f 100644
--- a/internals/overlord/checkstate/manager.go
+++ b/internals/overlord/checkstate/manager.go
@@ -27,6 +27,7 @@ import (
 // CheckManager starts and manages the health checks.
 type CheckManager struct {
     mutex           sync.Mutex
+    group           sync.WaitGroup
     checks          map[string]*checkData
     failureHandlers []FailureFunc
 }
@@ -58,6 +59,12 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
     for _, check := range m.checks {
         check.cancel()
     }
+    // Wait for all context cancellations to propagate and allow
+    // each goroutine to cleanly exit.
+    m.group.Wait()
+
+    // Set the size of the next wait group
+    m.group.Add(len(p.Checks))
 
     // Then configure and start new checks.
     checks := make(map[string]*checkData, len(p.Checks))
@@ -65,6 +72,7 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
         ctx, cancel := context.WithCancel(context.Background())
         check := &checkData{
             config:  config,
+            group:   &m.group,
             checker: newChecker(config),
             ctx:     ctx,
             cancel:  cancel,
@@ -155,6 +163,7 @@ const (
 // checkData holds state for an active health check.
 type checkData struct {
     config  *plan.Check
+    group   *sync.WaitGroup
     checker checker
     ctx     context.Context
     cancel  context.CancelFunc
@@ -171,6 +180,10 @@ type checker interface {
 }
 
 func (c *checkData) loop() {
+    // Schedule a notification on exit to indicate another
+    // checker in the group is complete.
+    defer c.group.Done()
+
     logger.Debugf("Check %q starting with period %v", c.config.Name, c.config.Period.Value)
 
     ticker := time.NewTicker(c.config.Period.Value)

From d36ce7f7e6addd5ad302922a9279ef7d5051acb9 Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Tue, 27 Jun 2023 12:52:00 +0200
Subject: [PATCH 2/3] checkstate: make tests more robust

The following changes are made:

- Remove a check which depends on check output in TestTimeout, as there
  is no way to remove the race between 'echo FOO' completing and the
  timeout being triggered. This failure can easily be reproduced by
  running the test under heavy CPU load (e.g. stress --cpu 10 --io 4):

  manager_test.go:140:
      c.Assert(check.ErrorDetails, Equals, "FOO")
  ... obtained string = ""
  ... expected string = "FOO"

- Make waitCheck() more conservative when waiting for a check runner
  iteration to complete. The worst-case timeout can safely be much
  longer, as this does not have a general impact on test duration. This
  failure can easily be reproduced by running the test under heavy CPU
  load (e.g. stress --cpu 10 --io 4):

  manager_test.go:134:
      check := waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
          return check.Status != CheckStatusUp
      })
  manager_test.go:283:
      c.Fatalf("timed out waiting for check %q", name)
  ... Error: timed out waiting for check "chk1"
---
 internals/overlord/checkstate/manager_test.go | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/internals/overlord/checkstate/manager_test.go b/internals/overlord/checkstate/manager_test.go
index 62ee24ab..7388aa60 100644
--- a/internals/overlord/checkstate/manager_test.go
+++ b/internals/overlord/checkstate/manager_test.go
@@ -45,12 +45,14 @@ func (s *ManagerSuite) SetUpSuite(c *C) {
     setLoggerOnce.Do(func() {
         logger.SetLogger(logger.New(os.Stderr, "[test] "))
     })
+}
 
+func (s *ManagerSuite) SetUpTest(c *C) {
     err := reaper.Start()
     c.Assert(err, IsNil)
 }
 
-func (s *ManagerSuite) TearDownSuite(c *C) {
+func (s *ManagerSuite) TearDownTest(c *C) {
     err := reaper.Stop()
     c.Assert(err, IsNil)
 }
@@ -137,7 +139,6 @@ func (s *ManagerSuite) TestTimeout(c *C) {
     c.Assert(check.Failures, Equals, 1)
     c.Assert(check.Threshold, Equals, 1)
     c.Assert(check.LastError, Equals, "exec check timed out")
-    c.Assert(check.ErrorDetails, Equals, "FOO")
 }
 
 func (s *ManagerSuite) TestCheckCanceled(c *C) {
@@ -161,17 +162,15 @@
         },
     })
 
-    // Wait for command to start (output file grows in size)
-    prevSize := 0
+    // Wait for command to start (output file is not zero in size)
     for i := 0; ; i++ {
         if i >= 100 {
             c.Fatalf("failed waiting for command to start")
         }
         b, _ := ioutil.ReadFile(tempFile)
-        if len(b) != prevSize {
+        if len(b) > 0 {
             break
         }
-        prevSize = len(b)
         time.Sleep(time.Millisecond)
     }
 
@@ -185,7 +184,6 @@
     stopChecks(c, mgr)
 
     // Ensure command was terminated (output file didn't grow in size)
-    time.Sleep(50 * time.Millisecond)
     b1, err := ioutil.ReadFile(tempFile)
     c.Assert(err, IsNil)
     time.Sleep(20 * time.Millisecond)
@@ -269,8 +267,20 @@ func (s *ManagerSuite) TestFailures(c *C) {
     c.Assert(failureName, Equals, "")
 }
 
+// waitCheck is a time based approach to wait for a checker run to complete.
+// The timeout value does not impact the general time it takes for tests to
+// complete, but determines a worse case waiting period before giving up.
+// The timeout value must take into account single core or very busy machines
+// so it makes sense to pick a conservative number here as failing a test
+// due to a busy test resource is more extensive than waiting a few more
+// seconds.
 func waitCheck(c *C, mgr *CheckManager, name string, f func(check *CheckInfo) bool) *CheckInfo {
-    for i := 0; i < 100; i++ {
+    // Worse case waiting time for checker run(s) to complete. This
+    // period should be much longer (10x is good) than the longest
+    // check timeout value.
+    timeout := time.Second * 10
+
+    for start := time.Now(); time.Since(start) < timeout; {
         checks, err := mgr.Checks()
         c.Assert(err, IsNil)
         for _, check := range checks {
@@ -280,6 +290,7 @@ func waitCheck(c *C, mgr *CheckManager, name string, f func(check *CheckInfo) bo
         }
         time.Sleep(time.Millisecond)
     }
+    c.Fatalf("timed out waiting for check %q", name)
     return nil
 }
 

From c853715be13aa0eb985a2ebf2af7f717da691389 Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Wed, 28 Jun 2023 12:09:42 +0200
Subject: [PATCH 3/3] Code review changes #1

---
 internals/overlord/checkstate/manager.go      | 17 +++++++----------
 internals/overlord/checkstate/manager_test.go |  4 ++--
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/internals/overlord/checkstate/manager.go b/internals/overlord/checkstate/manager.go
index af8db46f..6cba21ca 100644
--- a/internals/overlord/checkstate/manager.go
+++ b/internals/overlord/checkstate/manager.go
@@ -27,7 +27,7 @@ import (
 // CheckManager starts and manages the health checks.
 type CheckManager struct {
     mutex           sync.Mutex
-    group           sync.WaitGroup
+    wg              sync.WaitGroup
     checks          map[string]*checkData
     failureHandlers []FailureFunc
 }
@@ -61,10 +61,10 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
     }
     // Wait for all context cancellations to propagate and allow
     // each goroutine to cleanly exit.
-    m.group.Wait()
+    m.wg.Wait()
 
     // Set the size of the next wait group
-    m.group.Add(len(p.Checks))
+    m.wg.Add(len(p.Checks))
 
     // Then configure and start new checks.
     checks := make(map[string]*checkData, len(p.Checks))
@@ -72,14 +72,16 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
         ctx, cancel := context.WithCancel(context.Background())
         check := &checkData{
             config:  config,
-            group:   &m.group,
             checker: newChecker(config),
             ctx:     ctx,
             cancel:  cancel,
             action:  m.callFailureHandlers,
         }
         checks[name] = check
-        go check.loop()
+        go func() {
+            defer m.wg.Done()
+            check.loop()
+        }()
     }
     m.checks = checks
 }
@@ -163,7 +165,6 @@ const (
 // checkData holds state for an active health check.
 type checkData struct {
     config  *plan.Check
-    group   *sync.WaitGroup
     checker checker
     ctx     context.Context
     cancel  context.CancelFunc
@@ -180,10 +181,6 @@ type checker interface {
 }
 
 func (c *checkData) loop() {
-    // Schedule a notification on exit to indicate another
-    // checker in the group is complete.
-    defer c.group.Done()
-
     logger.Debugf("Check %q starting with period %v", c.config.Name, c.config.Period.Value)
 
     ticker := time.NewTicker(c.config.Period.Value)
diff --git a/internals/overlord/checkstate/manager_test.go b/internals/overlord/checkstate/manager_test.go
index 7388aa60..12b418b1 100644
--- a/internals/overlord/checkstate/manager_test.go
+++ b/internals/overlord/checkstate/manager_test.go
@@ -269,13 +269,13 @@ func (s *ManagerSuite) TestFailures(c *C) {
 
 // waitCheck is a time based approach to wait for a checker run to complete.
 // The timeout value does not impact the general time it takes for tests to
-// complete, but determines a worse case waiting period before giving up.
+// complete, but determines a worst case waiting period before giving up.
 // The timeout value must take into account single core or very busy machines
 // so it makes sense to pick a conservative number here as failing a test
 // due to a busy test resource is more extensive than waiting a few more
 // seconds.
 func waitCheck(c *C, mgr *CheckManager, name string, f func(check *CheckInfo) bool) *CheckInfo {
-    // Worse case waiting time for checker run(s) to complete. This
+    // Worst case waiting time for checker run(s) to complete. This
     // period should be much longer (10x is good) than the longest
     // check timeout value.
     timeout := time.Second * 10
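
For illustration, below is a minimal, self-contained Go sketch of the synchronization pattern this series converges on: replanning cancels the previous runners, waits on a WaitGroup until their goroutines have actually returned, and only then starts the next group, with each runner goroutine deferring Done(). The manager, planChanged, and runner names are simplified stand-ins for this sketch only, not the Pebble API.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// manager is a stand-in for CheckManager: it owns the WaitGroup and the
// cancel functions for the currently running group of runners.
type manager struct {
	mu      sync.Mutex
	wg      sync.WaitGroup
	cancels []context.CancelFunc
}

// planChanged mirrors the shape of CheckManager.PlanChanged: cancel the
// previous runners, wait for their goroutines to exit, then start new ones.
func (m *manager) planChanged(names []string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Cancel all previous runners.
	for _, cancel := range m.cancels {
		cancel()
	}
	// Context cancellation is asynchronous, so wait until every runner
	// goroutine has actually returned before starting the next group.
	m.wg.Wait()

	m.cancels = m.cancels[:0]
	m.wg.Add(len(names))
	for _, name := range names {
		ctx, cancel := context.WithCancel(context.Background())
		m.cancels = append(m.cancels, cancel)
		name := name
		go func() {
			// Decrement the group counter when the runner exits,
			// as the final patch of the series does.
			defer m.wg.Done()
			runner(ctx, name)
		}()
	}
}

// runner is a stand-in for checkData.loop: it runs until its context is
// canceled.
func runner(ctx context.Context, name string) {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			fmt.Println("running check", name)
		case <-ctx.Done():
			return
		}
	}
}

func main() {
	m := &manager{}
	m.planChanged([]string{"chk1"})
	time.Sleep(35 * time.Millisecond)
	// No goroutine from the first group can still be running here.
	m.planChanged([]string{"chk2"})
	time.Sleep(35 * time.Millisecond)
	m.planChanged(nil)
}

The ordering guarantee that matters is that wg.Wait() only returns after every runner's deferred Done() has run, which is what prevents an old check from overlapping a new one or outliving test teardown.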