Skip to content

Commit

Permalink
servstate: remove test races on svcs file output
Browse files Browse the repository at this point in the history
Unit Tests for servstate is extremely timing sensitive. If you run the unit
tests under heavy CPU load, changes are very good you will start to see
various failures. The easiest way to demonstrate this is to use the stress
utility to simulate heavy CPU load (stress --io 12 --cpu 12).

Unit Test output snippet:

START: manager_test.go:828: S.TestEnvironment

... obtained string = ""
... expected string = "" +
...     "PEBBLE_ENV_TEST_1=foo\n" +
...     "PEBBLE_ENV_TEST_2=bar bazz\n" +
...     "PEBBLE_ENV_TEST_PARENT=from-parent\n"

FAIL: manager_test.go:828: S.TestEnvironment

This specific race condition is caused because unit tests often make
assumptions on how long it takes from the moment the service is started
until the moment the service command completes.

This PR will only address this very specific kind of race, and hopefully
in the process provide a solution that will be used in future tests.

Please note that using the stress utility on tests from this package will
still trigger lots of remaining races caused by very timing sensitive tests.
  • Loading branch information
flotter committed Jul 28, 2023
1 parent b57ddee commit c6e4f6d
Showing 1 changed file with 103 additions and 81 deletions.
184 changes: 103 additions & 81 deletions internals/overlord/servstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ type S struct {
manager *servstate.ServiceManager
runner *state.TaskRunner
stopDaemon chan struct{}

donePath string
}

var _ = Suite(&S{})
Expand Down Expand Up @@ -165,6 +167,10 @@ func (s *S) SetUpTest(c *C) {
s.AddCleanup(restore)
restore = servstate.FakeKillFailDelay(shortKillDelay, shortFailDelay)
s.AddCleanup(restore)

// Sync feature to ensure service action is complete. See
// waitForFileToAppear()
s.donePath = filepath.Join(c.MkDir(), "done")
}

func (s *S) TearDownTest(c *C) {
Expand Down Expand Up @@ -389,10 +395,10 @@ func (s *S) TestCurrentUserGroup(c *C) {
services:
usrtest:
override: replace
command: /bin/sh -c "id -n -u >%s; sleep %g"
command: /bin/sh -c "id -n -u >%s; sync; touch %s; sleep %g"
user: %s
group: %s
`, outputPath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name))
`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name))
err = s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

Expand All @@ -401,6 +407,9 @@ services:
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

output, err := ioutil.ReadFile(outputPath)
c.Assert(err, IsNil)
c.Check(string(output), Equals, current.Username+"\n")
Expand Down Expand Up @@ -825,11 +834,11 @@ func (s *S) TestEnvironment(c *C) {
services:
envtest:
override: replace
command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; sleep 10"
command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; sync; touch %s; sleep 10"
environment:
PEBBLE_ENV_TEST_1: foo
PEBBLE_ENV_TEST_2: bar bazz
`, logPath)
`, logPath, s.donePath)
layer := parseLayer(c, 0, "envlayer", layerYAML)
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)
Expand All @@ -848,6 +857,9 @@ services:
c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err()))
s.st.Unlock()

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

// Ensure it read environment variables correctly
data, err := ioutil.ReadFile(logPath)
if os.IsNotExist(err) {
Expand Down Expand Up @@ -989,7 +1001,7 @@ func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; sleep 1'
command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1'
backoff-delay: 50ms
on-check-failure:
chk1: restart
Expand All @@ -1001,22 +1013,19 @@ checks:
threshold: 1
exec:
command: will-fail
`, tempFile))
`, tempFile, s.donePath))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
s.startServices(c, []string{"test2"}, 1)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, _ := ioutil.ReadFile(tempFile)
if string(b) == "x\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
Expand All @@ -1037,21 +1046,17 @@ checks:
s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool {
return svc.Current == servstate.StatusBackoff
})
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
if string(b) == "x\nx\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")

// Shouldn't be restarted again
time.Sleep(100 * time.Millisecond)
b, err := ioutil.ReadFile(tempFile)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")
checks, err := checkMgr.Checks()
Expand Down Expand Up @@ -1080,7 +1085,7 @@ func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; sleep 0.075'
command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 0.075'
backoff-delay: 50ms
backoff-factor: 100 # ensure it only backoff-restarts once
on-check-failure:
Expand All @@ -1093,47 +1098,41 @@ checks:
threshold: 1
exec:
command: will-fail
`, tempFile))
`, tempFile, s.donePath))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
s.startServices(c, []string{"test2"}, 1)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, _ := ioutil.ReadFile(tempFile)
if string(b) == "x\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")

// Ensure it exits and goes into backoff state
s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool {
return svc.Current == servstate.StatusBackoff
})

// Check failure should wait for current backoff (after which it will be restarted)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
if string(b) == "x\nx\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")

svc := s.serviceByName(c, "test2")
c.Assert(svc.Current, Equals, servstate.StatusActive)
c.Assert(s.manager.BackoffNum("test2"), Equals, 1)

// Shouldn't be restarted again
time.Sleep(125 * time.Millisecond)
b, err := ioutil.ReadFile(tempFile)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\nx\n")
checks, err := checkMgr.Checks()
Expand All @@ -1159,7 +1158,7 @@ func (s *S) TestOnCheckFailureIgnore(c *C) {
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; sleep 1'
command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1'
on-check-failure:
chk1: ignore
Expand All @@ -1170,22 +1169,19 @@ checks:
threshold: 1
exec:
command: will-fail
`, tempFile))
`, tempFile, s.donePath))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
s.startServices(c, []string{"test2"}, 1)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, _ := ioutil.ReadFile(tempFile)
if string(b) == "x\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
Expand All @@ -1204,7 +1200,7 @@ checks:

// Service shouldn't have been restarted
time.Sleep(100 * time.Millisecond)
b, err := ioutil.ReadFile(tempFile)
b, err = ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")
checks, err := checkMgr.Checks()
Expand Down Expand Up @@ -1232,7 +1228,7 @@ func (s *S) TestOnCheckFailureShutdown(c *C) {
services:
test2:
override: replace
command: /bin/sh -c 'echo x >>%s; sleep 1'
command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1'
on-check-failure:
chk1: shutdown
Expand All @@ -1243,22 +1239,19 @@ checks:
threshold: 1
exec:
command: will-fail
`, tempFile))
`, tempFile, s.donePath))
err := s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

// Start service and wait till it starts up (output file is written to)
s.startServices(c, []string{"test2"}, 1)
for i := 0; ; i++ {
if i >= 100 {
c.Fatalf("failed waiting for command to start")
}
b, _ := ioutil.ReadFile(tempFile)
if string(b) == "x\n" {
break
}
time.Sleep(5 * time.Millisecond)
}

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

b, err := ioutil.ReadFile(tempFile)
c.Assert(err, IsNil)
c.Assert(string(b), Equals, "x\n")

// Now wait till check happens (it will-fail)
for i := 0; ; i++ {
Expand Down Expand Up @@ -1587,8 +1580,8 @@ func (s *S) TestNoWorkingDir(c *C) {
services:
nowrkdir:
override: replace
command: /bin/sh -c "pwd >%s; sleep %g"
`, outputPath, shortOkayDelay.Seconds()+0.01))
command: /bin/sh -c "pwd >%s; sync; touch %s; sleep %g"
`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01))
err = s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

Expand All @@ -1599,6 +1592,9 @@ services:
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

output, err := ioutil.ReadFile(outputPath)
c.Assert(err, IsNil)
c.Check(string(output), Matches, ".*/overlord/servstate\n")
Expand All @@ -1618,9 +1614,9 @@ func (s *S) TestWorkingDir(c *C) {
services:
wrkdir:
override: replace
command: /bin/sh -c "pwd >%s; sleep %g"
command: /bin/sh -c "pwd >%s; sync; touch %s; sleep %g"
working-dir: %s
`, outputPath, shortOkayDelay.Seconds()+0.01, dir))
`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, dir))
err = s.manager.AppendLayer(layer)
c.Assert(err, IsNil)

Expand All @@ -1629,6 +1625,9 @@ services:
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

// Let's wait until we know the output file is available
s.waitForFileToAppear(c)

output, err := ioutil.ReadFile(outputPath)
c.Assert(err, IsNil)
c.Check(string(output), Equals, dir+"\n")
Expand All @@ -1649,6 +1648,29 @@ func (s *S) defaultServiceManagerSetup(c *C) {
c.Assert(err, IsNil)
}

// Return on timeout or when the file appears. This is used to determine
// when the expected service output is actually available, not when the
// service starts to run.
func (s *S) waitForFileToAppear(c *C) {
timeout := time.After(time.Duration(time.Second * 10))
found:
for {
select {
case <-timeout:
c.Fatal("timeout waiting for service output")
default:
stat, err := os.Stat(s.donePath)
if err == nil && stat.Mode().IsRegular() {
// Delete it so we can reuse this feature
// in the same test.
os.Remove(s.donePath)
break found
}
time.Sleep(time.Millisecond)
}
}
}

// Make sure services are all stopped before the next test starts.
func (s *S) stopRunningServices(c *C) {
taskSet, err := servstate.StopRunning(s.st, s.manager)
Expand Down

0 comments on commit c6e4f6d

Please sign in to comment.