servstate: remove on-check-failure test races
This patch improves the following tests, all related to checker
functionality in servstate, which were observed to race when run
under stress:

TestOnCheckFailureRestartWhileRunning
TestOnCheckFailureRestartDuringBackoff
TestOnCheckFailureIgnore
TestOnCheckFailureShutdown

Since we are unit testing servstate, and not checkstate, this patch
adds code to more effectively control (mock) when the check failure
actions are delivered during the test. This removes the requirement
for tests to rely on finely tuned delays to check various service
life-cycle states.

The following changes are made:

- Round some of the delays to multiples of 100ms to highlight the
  fact that these values are less important now. Delays intended to
  be effectively infinite are replaced with 10s to ensure severe
  failures are observed if the test does not work as intended.

- Remove the iteration code that waits for a checker action, and
  replace it with channel synchronization (see the standalone sketch
  after this list).

- Add some additional function comments to explain what the tests
  aim to confirm.
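
As a reference for the second bullet, here is a minimal standalone
sketch of the channel synchronization pattern the tests now use. The
checkNotifier type and its method names are hypothetical stand-ins for
the real checkstate/servstate APIs; only the channel-gating logic
mirrors the diff below.

package main

import (
	"fmt"
	"time"
)

// checkNotifier is an illustrative stand-in for the check manager's
// failure-notification hook (not Pebble's real checkstate API).
type checkNotifier struct {
	onFailure func(name string)
}

func (n *checkNotifier) NotifyCheckFailed(f func(name string)) {
	n.onFailure = f
}

func main() {
	notifier := &checkNotifier{}

	// Gate the failure action behind an unbuffered channel so the test
	// decides exactly when the action is applied, instead of polling
	// with finely tuned sleeps.
	checkFailed := make(chan int)
	notifier.NotifyCheckFailed(func(name string) {
		select {
		case checkFailed <- 1:
		case <-time.After(10 * time.Second):
			panic("this test is broken")
		}
		// In the real tests, s.manager.CheckFailed(name) runs here.
		fmt.Println("failure action applied for", name)
	})

	// Simulate the checker reporting a failure from another goroutine.
	done := make(chan struct{})
	go func() {
		notifier.onFailure("chk1")
		close(done)
	}()

	// Test side: block until the failure is delivered (or time out).
	select {
	case <-checkFailed:
		fmt.Println("check failure observed")
	case <-time.After(10 * time.Second):
		panic("timed out waiting for check failure to happen")
	}
	<-done
}
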
flotter committed Aug 2, 2023
1 parent 6991b3c commit 671d52b
Showing 1 changed file with 105 additions and 56 deletions.
161 changes: 105 additions & 56 deletions internals/overlord/servstate/manager_test.go
@@ -976,6 +976,11 @@ services:
})
}

// The aim of this test is to make sure that the actioned check
// failure terminates the service, after which it will first go
// to back-off state and then finally start again (only once).
// Since the check always fails, it should only ever send an
// action once.
func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -984,31 +989,40 @@ func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan int)
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- 1:
case <-time.After(10 * time.Second):
panic("this test is broken")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
backoff-delay: 50ms
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
backoff-delay: 100ms
on-check-failure:
chk1: restart
checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
`, tempFile, s.insertDoneCheck(c, "test2")))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
// Start service and wait till it starts up
s.startServices(c, []string{"test2"}, 1)

s.waitForDoneCheck(c, "test2")
@@ -1018,19 +1032,17 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting check failure to happen")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// Check failure should terminate process, backoff, and restart it, so wait for that
s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool {
@@ -1044,11 +1056,11 @@ checks:
c.Assert(string(b), Equals, "x\nx\n")

// Shouldn't be restarted again
time.Sleep(100 * time.Millisecond)
time.Sleep(125 * time.Millisecond)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")
checks, err := checkMgr.Checks()
checks, err = checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
@@ -1058,6 +1070,11 @@ checks:
c.Assert(s.manager.BackoffNum("test2"), Equals, 1)
}

// The aim of this test is to make sure that the actioned check
// failure occurring during service back-off has no effect on
// that service. The service is expected to restart by itself
// (due to back-off). Since the check always fails, it should
// only ever send an action once.
func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -1066,16 +1083,25 @@ func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan int)
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- 1:
case <-time.After(10 * time.Second):
panic("this test is broken")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 0.075'
backoff-delay: 50ms
command: /bin/sh -c 'echo x >>%s; %s; sleep 0.1'
backoff-delay: 100ms
backoff-factor: 100 # ensure it only backoff-restarts once
on-check-failure:
chk1: restart
@@ -1091,7 +1117,7 @@ checks:
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
// Start service and wait till it starts up
s.startServices(c, []string{"test2"}, 1)

s.waitForDoneCheck(c, "test2")
@@ -1105,7 +1131,12 @@ checks:
return svc.Current == servstate.StatusBackoff
})

// Check failure should wait for current backoff (after which it will be restarted)
// Now wait till check happens (it will-fail)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting check failure to happen")
}

s.waitForDoneCheck(c, "test2")

@@ -1129,6 +1160,10 @@ checks:
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
}

// The aim of this test is to make sure that the actioned check
// failure is ignored, and as a result the service keeps on
// running. Since the check always fails, it should only ever
// send an action once.
func (s *S) TestOnCheckFailureIgnore(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -1137,22 +1172,31 @@ func (s *S) TestOnCheckFailureIgnore(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan int)
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- 1:
case <-time.After(10 * time.Second):
panic("this test is broken")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
on-check-failure:
chk1: ignore
checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
@@ -1170,26 +1214,24 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting check failure to happen")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// Service shouldn't have been restarted
time.Sleep(100 * time.Millisecond)
time.Sleep(125 * time.Millisecond)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")
checks, err := checkMgr.Checks()
checks, err = checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
@@ -1206,22 +1248,31 @@ func (s *S) TestOnCheckFailureShutdown(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan int)
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- 1:
case <-time.After(10 * time.Second):
panic("this test is broken")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
on-check-failure:
chk1: shutdown
checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
@@ -1239,19 +1290,17 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting check failure to happen")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// It should have closed the stopDaemon channel.
select {