servstate: remove on-check-failure test races #272

Merged
2 commits merged on Aug 2, 2023
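Every test touched by this diff replaces a poll-and-sleep wait with the same synchronization pattern: the callback passed to checkMgr.NotifyCheckFailed first signals a test-owned channel, so the test blocks until the on-check-failure action is actually about to be applied. Below is a minimal, self-contained sketch of that idea (not part of the patch); the helpers notifyFailed and applyAction are illustrative stand-ins, not functions from the Pebble codebase.

package main

import (
    "fmt"
    "time"
)

// notifyFailed stands in for checkMgr.NotifyCheckFailed: it reports a single
// check failure asynchronously, like a check that always fails.
func notifyFailed(callback func(name string)) {
    go callback("chk1")
}

// applyAction stands in for the on-check-failure action; the real tests call
// s.manager.CheckFailed(name) at this point.
func applyAction(name string, done chan<- struct{}) {
    fmt.Println("applying on-check-failure action for", name)
    close(done)
}

func main() {
    checkFailed := make(chan struct{})
    actionDone := make(chan struct{})

    // Wrap the callback so the test decides when the action may be applied.
    notifyFailed(func(name string) {
        select {
        case checkFailed <- struct{}{}:
        case <-time.After(10 * time.Second):
            panic("timed out waiting to send on check-failed channel")
        }
        applyAction(name, actionDone)
    })

    // Block until the failure actually arrives, instead of polling check
    // state in a sleep loop (the source of the original races).
    select {
    case <-checkFailed:
    case <-time.After(10 * time.Second):
        panic("timed out waiting for check failure to arrive")
    }

    <-actionDone // the action runs only after the test has observed the failure
}

The 10-second timeouts only guard against a hung test; in the normal case the channel hand-off makes each failure action observable exactly once, which is what the later assertions on the output file and BackoffNum rely on.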
internals/overlord/servstate/manager_test.go: 161 changes (105 additions, 56 deletions)
@@ -976,6 +976,11 @@ services:
})
}

// The aim of this test is to make sure that the actioned check
// failure terminates the service, after which it will first go
// to the back-off state and then finally start again (only once).
// Since the check always fails, it should only ever send an
// action once.
func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -984,31 +989,40 @@ func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan struct{})
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- struct{}{}:
case <-time.After(10 * time.Second):
panic("timed out waiting to send on check-failed channel")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
backoff-delay: 50ms
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
backoff-delay: 100ms
on-check-failure:
chk1: restart

checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
`, tempFile, s.insertDoneCheck(c, "test2")))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
// Start service and wait till it starts up
s.startServices(c, []string{"test2"}, 1)

s.waitForDoneCheck(c, "test2")
@@ -1018,19 +1032,17 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting for check failure to arrive")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// Check failure should terminate process, backoff, and restart it, so wait for that
s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool {
@@ -1044,11 +1056,11 @@ checks:
c.Assert(string(b), Equals, "x\nx\n")

// Shouldn't be restarted again
time.Sleep(100 * time.Millisecond)
time.Sleep(125 * time.Millisecond)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")
checks, err := checkMgr.Checks()
checks, err = checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
@@ -1058,6 +1070,11 @@ checks:
c.Assert(s.manager.BackoffNum("test2"), Equals, 1)
}

// The aim of this test is to make sure that an actioned check
// failure occurring during service back-off has no effect on
// that service. The service is expected to restart by itself
// (due to back-off). Since the check always fails, it should
// only ever send an action once.
func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -1066,16 +1083,25 @@ func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan struct{})
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- struct{}{}:
case <-time.After(10 * time.Second):
panic("timed out waiting to send on check-failed channel")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 0.075'
backoff-delay: 50ms
command: /bin/sh -c 'echo x >>%s; %s; sleep 0.1'
backoff-delay: 100ms
backoff-factor: 100 # ensure it only backoff-restarts once
on-check-failure:
chk1: restart
@@ -1091,7 +1117,7 @@ checks:
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
// Start service and wait till it starts up
s.startServices(c, []string{"test2"}, 1)

s.waitForDoneCheck(c, "test2")
@@ -1105,7 +1131,12 @@ checks:
return svc.Current == servstate.StatusBackoff
})

// Check failure should wait for current backoff (after which it will be restarted)
// Now wait till check happens (it will-fail)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting for check failure to arrive")
}

s.waitForDoneCheck(c, "test2")

@@ -1129,6 +1160,10 @@ checks:
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
}

// The aim of this test is to make sure that the actioned check
// failure is ignored, and as a result the service keeps on
// running. Since the check always fails, it should only ever
// send an action once.
func (s *S) TestOnCheckFailureIgnore(c *C) {
s.setupDefaultServiceManager(c)
// Create check manager and tell it about plan updates
@@ -1137,22 +1172,31 @@ func (s *S) TestOnCheckFailureIgnore(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan struct{})
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- struct{}{}:
case <-time.After(10 * time.Second):
panic("timed out waiting to send on check-failed channel")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
on-check-failure:
chk1: ignore

checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
@@ -1170,26 +1214,24 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting for check failure to arrive")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// Service shouldn't have been restarted
time.Sleep(100 * time.Millisecond)
time.Sleep(125 * time.Millisecond)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")
checks, err := checkMgr.Checks()
checks, err = checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
@@ -1206,22 +1248,31 @@ func (s *S) TestOnCheckFailureShutdown(c *C) {
s.manager.NotifyPlanChanged(checkMgr.PlanChanged)

// Tell service manager about check failures
checkMgr.NotifyCheckFailed(s.manager.CheckFailed)
checkFailed := make(chan struct{})
checkMgr.NotifyCheckFailed(func(name string) {
// Control when the action should be applied
select {
case checkFailed <- struct{}{}:
case <-time.After(10 * time.Second):
panic("timed out waiting to send on check-failed channel")
}
s.manager.CheckFailed(name)
})

tempDir := c.MkDir()
tempFile := filepath.Join(tempDir, "out")
layer := parseLayer(c, 0, "layer", fmt.Sprintf(`
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; %s; sleep 1'
command: /bin/sh -c 'echo x >>%s; %s; sleep 10'
on-check-failure:
chk1: shutdown

checks:
chk1:
override: replace
period: 75ms # a bit longer than shortOkayDelay
period: 100ms
threshold: 1
exec:
command: will-fail
@@ -1239,19 +1290,17 @@ checks:
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for check to fail")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
break
}
time.Sleep(5 * time.Millisecond)
select {
case <-checkFailed:
case <-time.After(10 * time.Second):
c.Fatalf("timed out waiting for check failure to arrive")
}
checks, err := checkMgr.Checks()
c.Assert(err, IsNil)
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].Failures, Equals, 1)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// It should have closed the stopDaemon channel.
select {