From 2c66795b865309766d78a7bd87cdefa13f1d3efc Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Tue, 24 Sep 2024 19:48:04 +0800 Subject: [PATCH 01/11] poc: allow stopping services that are starting and restart services that quickly exist within the 1s okay delay --- internals/overlord/servstate/handlers.go | 16 ++++++++++++++++ internals/overlord/servstate/manager.go | 2 +- tests/run_test.go | 6 +----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/internals/overlord/servstate/handlers.go b/internals/overlord/servstate/handlers.go index 2bcdd6945..937c4cbbc 100644 --- a/internals/overlord/servstate/handlers.go +++ b/internals/overlord/servstate/handlers.go @@ -512,6 +512,12 @@ func (s *serviceData) exited(exitCode int) error { case stateStarting: s.started <- fmt.Errorf("exited quickly with code %d", exitCode) s.transition(stateExited) // not strictly necessary as doStart will return, but doesn't hurt + action, onType := getAction(s.config, exitCode == 0) + switch action { + case plan.ActionRestart: + s.doBackoff(action, onType) + default: + } case stateRunning: logger.Noticef("Service %q stopped unexpectedly with code %d", s.config.Name, exitCode) @@ -682,6 +688,16 @@ func (s *serviceData) stop() error { defer s.manager.servicesLock.Unlock() switch s.state { + case stateStarting: + s.started <- fmt.Errorf("stopped before the 1 second okay delay") + logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name) + err := syscall.Kill(-s.cmd.Process.Pid, syscall.SIGTERM) + if err != nil { + logger.Noticef("Cannot send SIGTERM to process: %v", err) + } + s.transition(stateTerminating) + time.AfterFunc(s.killDelay(), func() { logError(s.terminateTimeElapsed()) }) + case stateRunning: logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name) // First send SIGTERM to try to terminate it gracefully. diff --git a/internals/overlord/servstate/manager.go b/internals/overlord/servstate/manager.go index 43a379d27..7b319aab1 100644 --- a/internals/overlord/servstate/manager.go +++ b/internals/overlord/servstate/manager.go @@ -375,7 +375,7 @@ func servicesToStop(m *ServiceManager) ([][]string, error) { var notStopped []string for _, name := range services { s := m.services[name] - if s != nil && (s.state == stateRunning || s.state == stateBackoff) { + if s != nil && (s.state == stateStarting || s.state == stateRunning || s.state == stateBackoff) { notStopped = append(notStopped, name) } } diff --git a/tests/run_test.go b/tests/run_test.go index c98c6937e..d9b0073e4 100644 --- a/tests/run_test.go +++ b/tests/run_test.go @@ -49,9 +49,7 @@ services: createLayer(t, pebbleDir, "001-simple-layer.yaml", layerYAML) - _, stderrCh := pebbleRun(t, pebbleDir) - waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) - + _, _ = pebbleRun(t, pebbleDir) waitForFile(t, filepath.Join(pebbleDir, "svc1"), 3*time.Second) waitForFile(t, filepath.Join(pebbleDir, "svc2"), 3*time.Second) } @@ -142,7 +140,6 @@ services: stdoutCh, stderrCh := pebbleRun(t, pebbleDir, "--verbose") waitForLog(t, stderrCh, "pebble", "Started daemon", 3*time.Second) waitForLog(t, stdoutCh, "svc1", "hello world", 3*time.Second) - waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) } // TestArgs tests that Pebble provides additional arguments to a service @@ -168,7 +165,6 @@ services: ) waitForLog(t, stderrCh, "pebble", "Started daemon", 3*time.Second) waitForLog(t, stdoutCh, "svc1", "hello world", 3*time.Second) - waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) } // TestIdentities tests that Pebble seeds identities from a file From 649ebbd5a6b884c27f9b592423e304b8cf0d8e45 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Thu, 26 Sep 2024 13:45:12 +0800 Subject: [PATCH 02/11] chore: remove fix for another issue and use fallthrough --- internals/overlord/servstate/handlers.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/internals/overlord/servstate/handlers.go b/internals/overlord/servstate/handlers.go index 937c4cbbc..c14227124 100644 --- a/internals/overlord/servstate/handlers.go +++ b/internals/overlord/servstate/handlers.go @@ -512,12 +512,6 @@ func (s *serviceData) exited(exitCode int) error { case stateStarting: s.started <- fmt.Errorf("exited quickly with code %d", exitCode) s.transition(stateExited) // not strictly necessary as doStart will return, but doesn't hurt - action, onType := getAction(s.config, exitCode == 0) - switch action { - case plan.ActionRestart: - s.doBackoff(action, onType) - default: - } case stateRunning: logger.Noticef("Service %q stopped unexpectedly with code %d", s.config.Name, exitCode) @@ -690,13 +684,7 @@ func (s *serviceData) stop() error { switch s.state { case stateStarting: s.started <- fmt.Errorf("stopped before the 1 second okay delay") - logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name) - err := syscall.Kill(-s.cmd.Process.Pid, syscall.SIGTERM) - if err != nil { - logger.Noticef("Cannot send SIGTERM to process: %v", err) - } - s.transition(stateTerminating) - time.AfterFunc(s.killDelay(), func() { logError(s.terminateTimeElapsed()) }) + fallthrough case stateRunning: logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name) From cf0c4dcc0432a3e86651cb27522e69bfd2b9398a Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Thu, 26 Sep 2024 13:52:50 +0800 Subject: [PATCH 03/11] chore: update state transfer graph --- .../overlord/servstate/state-diagram.dot | 1 + .../overlord/servstate/state-diagram.svg | 248 +++++++++--------- 2 files changed, 129 insertions(+), 120 deletions(-) diff --git a/internals/overlord/servstate/state-diagram.dot b/internals/overlord/servstate/state-diagram.dot index 7954864dc..e3da5c620 100644 --- a/internals/overlord/servstate/state-diagram.dot +++ b/internals/overlord/servstate/state-diagram.dot @@ -7,6 +7,7 @@ digraph service_state_machine { node [penwidth=1] initial -> starting [label="start"] starting -> running [label="okay wait\nelapsed"] + starting -> terminating [label="stopped before\nokay wait elapses"] running -> terminating [label="stop"] running -> terminating [label="check failed\n(action \"restart\")"] terminating -> killing [label="terminate time\nelapsed"] diff --git a/internals/overlord/servstate/state-diagram.svg b/internals/overlord/servstate/state-diagram.svg index 4dd2f2bd3..60dd86ffa 100644 --- a/internals/overlord/servstate/state-diagram.svg +++ b/internals/overlord/servstate/state-diagram.svg @@ -1,221 +1,229 @@ - - - + + service_state_machine - + initial - -initial + +initial starting - -starting + +starting initial->starting - - -start + + +start running - -running + +running starting->running - - -okay wait -elapsed + + +okay wait +elapsed + + + +terminating + +terminating + + + +starting->terminating + + +stopped before +okay wait elapses exited - -exited + +exited - + starting->exited - - -exited - - - -terminating - -terminating + + +exited - + running->terminating - - -stop + + +stop - + running->terminating - - -check failed -(action "restart") + + +check failed +(action "restart") backoff - -backoff + +backoff - + running->backoff - - -exited -(action "restart") + + +exited +(action "restart") - + running->exited - - -exited -(action "ignore") + + +exited +(action "ignore") - + running->exited - - -exited -(action "shutdown") + + +exited +(action "shutdown") killing - -killing + +killing - + terminating->killing - - -terminate time -elapsed + + +terminate time +elapsed stopped - -stopped + +stopped - + terminating->stopped - - -exited -(not restarting) + + +exited +(not restarting) - + terminating->backoff - - -exited -(restarting) + + +exited +(restarting) - + killing->stopped - - -exited -(not restarting) + + +exited +(not restarting) - + killing->stopped - - -kill time -elapsed + + +kill time +elapsed - + killing->backoff - - -exited -(restarting) + + +exited +(restarting) - + stopped->starting - - -start + + +start - + backoff->starting - - -start + + +start - + backoff->running - - -backoff time -elapsed + + +backoff time +elapsed - + backoff->stopped - - -stop + + +stop - + exited->starting - - -start + + +start - + exited->stopped - - -stop + + +stop - + exited->backoff - - -check failed -(action "restart") + + +check failed +(action "restart") From 326204a3fec4ac45e0a6854ee4fe3077583ed428 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Fri, 27 Sep 2024 09:24:45 +0800 Subject: [PATCH 04/11] Update internals/overlord/servstate/handlers.go Co-authored-by: Ben Hoyt --- internals/overlord/servstate/handlers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internals/overlord/servstate/handlers.go b/internals/overlord/servstate/handlers.go index c14227124..24172e0c7 100644 --- a/internals/overlord/servstate/handlers.go +++ b/internals/overlord/servstate/handlers.go @@ -683,7 +683,7 @@ func (s *serviceData) stop() error { switch s.state { case stateStarting: - s.started <- fmt.Errorf("stopped before the 1 second okay delay") + s.started <- fmt.Errorf("stopped before the %s okay delay", okayDelay) fallthrough case stateRunning: From ec0b876195102d0e4334df2043387bcd9fc8b247 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Fri, 27 Sep 2024 09:24:50 +0800 Subject: [PATCH 05/11] Update internals/overlord/servstate/state-diagram.dot Co-authored-by: Ben Hoyt --- internals/overlord/servstate/state-diagram.dot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internals/overlord/servstate/state-diagram.dot b/internals/overlord/servstate/state-diagram.dot index e3da5c620..63f939142 100644 --- a/internals/overlord/servstate/state-diagram.dot +++ b/internals/overlord/servstate/state-diagram.dot @@ -7,7 +7,7 @@ digraph service_state_machine { node [penwidth=1] initial -> starting [label="start"] starting -> running [label="okay wait\nelapsed"] - starting -> terminating [label="stopped before\nokay wait elapses"] + starting -> terminating [label="stop (before\nokay wait elapses)"] running -> terminating [label="stop"] running -> terminating [label="check failed\n(action \"restart\")"] terminating -> killing [label="terminate time\nelapsed"] From 4927068c02e16b97c0374cece61badbd9ad9c06a Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Fri, 27 Sep 2024 09:25:35 +0800 Subject: [PATCH 06/11] chore: uddate state diagram --- internals/overlord/servstate/state-diagram.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internals/overlord/servstate/state-diagram.svg b/internals/overlord/servstate/state-diagram.svg index 60dd86ffa..44795c88b 100644 --- a/internals/overlord/servstate/state-diagram.svg +++ b/internals/overlord/servstate/state-diagram.svg @@ -51,10 +51,10 @@ starting->terminating - + -stopped before -okay wait elapses +stop (before +okay wait elapses) From 28d94de9ce676154883900831ba4c5024751958a Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Fri, 27 Sep 2024 19:06:17 +0800 Subject: [PATCH 07/11] test: add tests for stopping services within the okay delay --- internals/daemon/daemon_test.go | 65 +++++++++++++++++++- internals/overlord/servstate/manager_test.go | 46 ++++++++++++++ 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index 6751ead9f..8bd624785 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -1148,7 +1148,7 @@ services: // We have to wait for it be in running state. for i := 0; ; i++ { if i >= 25 { - c.Fatalf("timed out waiting or service to start") + c.Fatalf("timed out waiting for service to start") } d.state.Lock() change := d.state.Change(rsp.Change) @@ -1283,7 +1283,7 @@ services: // We have to wait for it be in running state for StopRunning to stop it. for i := 0; ; i++ { if i >= 25 { - c.Fatalf("timed out waiting or service to start") + c.Fatalf("timed out waiting for service to start") } d.state.Lock() change := d.state.Change(rsp.Change) @@ -1317,6 +1317,67 @@ services: c.Check(tasks[0].Kind(), Equals, "stop") } +func (s *daemonSuite) TestStopWithinOkayDelay(c *C) { + // Start the daemon. + writeTestLayer(s.pebbleDir, ` +services: + test1: + override: replace + command: sleep 10 +`) + d := s.newDaemon(c) + err := d.Init() + c.Assert(err, IsNil) + c.Assert(d.Start(), IsNil) + + // Start the test service. + payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`) + req, err := http.NewRequest("POST", "/v1/services", payload) + c.Assert(err, IsNil) + rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp) + rec := httptest.NewRecorder() + rsp.ServeHTTP(rec, req) + c.Check(rec.Result().StatusCode, Equals, 202) + + // Wait for the change to be in doing state. + for i := 0; ; i++ { + if i >= 10 { + c.Fatalf("timed out waiting for change") + } + d.state.Lock() + change := d.state.Change(rsp.Change) + changeStatus := change.Status() + d.state.Unlock() + if change != nil && changeStatus == state.DoingStatus { + break + } + time.Sleep(20 * time.Millisecond) + } + + // Stop the daemon within the okayDelay, + // which should stop services in running state. + err = d.Stop(nil) + c.Assert(err, IsNil) + + // Ensure the "stop" change was created, along with its "stop" tasks. + d.state.Lock() + defer d.state.Unlock() + changes := d.state.Changes() + var change *state.Change + for _, ch := range changes { + if ch.Kind() == "stop" { + change = ch + } + } + if change == nil { + c.Fatalf("stop change not found") + } + c.Check(change.Status(), Equals, state.DoneStatus) + tasks := change.Tasks() + c.Assert(tasks, HasLen, 1) + c.Check(tasks[0].Kind(), Equals, "stop") +} + func (s *daemonSuite) TestWritesRequireAdminAccess(c *C) { for _, cmd := range API { if cmd.Path == "/v1/notices" { diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go index c097336ca..98fbf85c0 100644 --- a/internals/overlord/servstate/manager_test.go +++ b/internals/overlord/servstate/manager_test.go @@ -297,6 +297,52 @@ services: c.Assert(s.manager.StopTimeout(), Equals, time.Minute*60+time.Millisecond*200) } +func (s *S) TestStopServiceWithinOkayDelay(c *C) { + // A longer okayDelay is used so that the change for starting the services won't + // quickly transition into the running state. + fakeOkayDelay := 5 * shortOkayDelay + servstate.FakeOkayWait(fakeOkayDelay) + + s.newServiceManager(c) + layer := ` +services: + %s: + override: replace + command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}" +` + serviceName := "test-stop-within-okaywait" + s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay)) + s.planChanged(c) + + // Start the service without waiting for change ready. + s.st.Lock() + ts, err := servstate.Start(s.st, [][]string{{serviceName}}) + c.Check(err, IsNil) + chgStart := s.st.NewChange("test", "Start test") + chgStart.AddAll(ts) + s.st.Unlock() + s.runner.Ensure() + + // Stop the service immediately within okayDelay + chg := s.stopServices(c, [][]string{{serviceName}}) + s.st.Lock() + c.Assert(chg.Err(), IsNil) + s.st.Unlock() + + waitChangeReady(c, s.runner, chgStart, "Start test") + + s.st.Lock() + c.Check(chgStart.Status(), Equals, state.ErrorStatus) + c.Check(chgStart.Err(), ErrorMatches, `(?s).*cannot start service: exited quickly with code.*`) + s.st.Unlock() + + donePath := filepath.Join(s.dir, serviceName) + // DonePath not created means the service is terminated within the okayWait. + if _, err := os.Stat(donePath); err == nil { + c.Fatalf("service %s waiting for service output", serviceName) + } +} + func (s *S) TestKillDelayIsUsed(c *C) { s.newServiceManager(c) s.planAddLayer(c, testPlanLayer) From b46ca80b623327200df6e4faee121229e6008db5 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Sun, 29 Sep 2024 11:01:38 +0800 Subject: [PATCH 08/11] chore: update test cases with comments --- internals/daemon/daemon_test.go | 7 +++---- internals/overlord/servstate/manager_test.go | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index 8bd624785..a73cceb0a 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -1339,7 +1339,7 @@ services: rsp.ServeHTTP(rec, req) c.Check(rec.Result().StatusCode, Equals, 202) - // Wait for the change to be in doing state. + // Wait for the change to be in doing state so that the service is in starting state. for i := 0; ; i++ { if i >= 10 { c.Fatalf("timed out waiting for change") @@ -1354,12 +1354,11 @@ services: time.Sleep(20 * time.Millisecond) } - // Stop the daemon within the okayDelay, - // which should stop services in running state. + // Stop the daemon within the okayDelay, which should stop services in starting state. err = d.Stop(nil) c.Assert(err, IsNil) - // Ensure the "stop" change was created, along with its "stop" tasks. + // Ensure a "stop" change is created, along with its "stop" tasks. d.state.Lock() defer d.state.Unlock() changes := d.state.Changes() diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go index 98fbf85c0..2f95ae374 100644 --- a/internals/overlord/servstate/manager_test.go +++ b/internals/overlord/servstate/manager_test.go @@ -298,12 +298,13 @@ services: } func (s *S) TestStopServiceWithinOkayDelay(c *C) { - // A longer okayDelay is used so that the change for starting the services won't + // A longer okayDelay is used so that the change for starting the service won't // quickly transition into the running state. fakeOkayDelay := 5 * shortOkayDelay servstate.FakeOkayWait(fakeOkayDelay) s.newServiceManager(c) + // The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath). layer := ` services: %s: @@ -337,7 +338,7 @@ services: s.st.Unlock() donePath := filepath.Join(s.dir, serviceName) - // DonePath not created means the service is terminated within the okayWait. + // If the service is stopped within okayDelay and is indeed terminated, donePath should not exist. if _, err := os.Stat(donePath); err == nil { c.Fatalf("service %s waiting for service output", serviceName) } From 1132f735686b1c87452ae1d3ccebc2e6c1e93583 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Tue, 8 Oct 2024 20:31:02 +0800 Subject: [PATCH 09/11] chore: refactor after code review and fix erroneous test case --- internals/daemon/daemon_test.go | 9 ++++++--- internals/overlord/servstate/manager_test.go | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index a73cceb0a..93b34949f 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -1341,14 +1341,17 @@ services: // Wait for the change to be in doing state so that the service is in starting state. for i := 0; ; i++ { - if i >= 10 { + if i >= 45 { c.Fatalf("timed out waiting for change") } d.state.Lock() change := d.state.Change(rsp.Change) - changeStatus := change.Status() + var status state.Status + if change != nil { + status = change.Status() + } d.state.Unlock() - if change != nil && changeStatus == state.DoingStatus { + if status == state.DoingStatus { break } time.Sleep(20 * time.Millisecond) diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go index 2f95ae374..3a8c9d30d 100644 --- a/internals/overlord/servstate/manager_test.go +++ b/internals/overlord/servstate/manager_test.go @@ -300,10 +300,11 @@ services: func (s *S) TestStopServiceWithinOkayDelay(c *C) { // A longer okayDelay is used so that the change for starting the service won't // quickly transition into the running state. - fakeOkayDelay := 5 * shortOkayDelay + fakeOkayDelay := 20 * shortOkayDelay servstate.FakeOkayWait(fakeOkayDelay) s.newServiceManager(c) + serviceName := "test-stop-within-okaywait" // The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath). layer := ` services: @@ -311,8 +312,7 @@ services: override: replace command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}" ` - serviceName := "test-stop-within-okaywait" - s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay)) + s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay.Seconds())) s.planChanged(c) // Start the service without waiting for change ready. @@ -324,7 +324,12 @@ services: s.st.Unlock() s.runner.Ensure() - // Stop the service immediately within okayDelay + // Wait until the service is in the starting state. + s.waitUntilService(c, serviceName, func(service *servstate.ServiceInfo) bool { + return service.Current == servstate.StatusActive + }) + + // Stop the service within okayDelay. chg := s.stopServices(c, [][]string{{serviceName}}) s.st.Lock() c.Assert(chg.Err(), IsNil) @@ -334,7 +339,7 @@ services: s.st.Lock() c.Check(chgStart.Status(), Equals, state.ErrorStatus) - c.Check(chgStart.Err(), ErrorMatches, `(?s).*cannot start service: exited quickly with code.*`) + c.Check(chgStart.Err(), ErrorMatches, fmt.Sprintf(`(?s).*stopped before the %s okay delay.*`, fakeOkayDelay)) s.st.Unlock() donePath := filepath.Join(s.dir, serviceName) From 13a33cbe6d71d6cdb994750baf5eda47dddc790d Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Tue, 8 Oct 2024 20:38:58 +0800 Subject: [PATCH 10/11] chore: update test case --- internals/daemon/daemon_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index 93b34949f..045aa1b2b 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -1341,7 +1341,7 @@ services: // Wait for the change to be in doing state so that the service is in starting state. for i := 0; ; i++ { - if i >= 45 { + if i >= 18 { c.Fatalf("timed out waiting for change") } d.state.Lock() @@ -1354,7 +1354,7 @@ services: if status == state.DoingStatus { break } - time.Sleep(20 * time.Millisecond) + time.Sleep(50 * time.Millisecond) } // Stop the daemon within the okayDelay, which should stop services in starting state. From 1e60a5174a566e790622f08a5dff611d634ea131 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Wed, 9 Oct 2024 12:04:01 +0800 Subject: [PATCH 11/11] chore: fix intermittent failed test case --- internals/daemon/daemon_test.go | 27 +++++++++++-------------- internals/overlord/servstate/manager.go | 2 +- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index 045aa1b2b..58976c09c 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -40,6 +40,7 @@ import ( "github.com/canonical/pebble/internals/overlord" "github.com/canonical/pebble/internals/overlord/patch" "github.com/canonical/pebble/internals/overlord/restart" + "github.com/canonical/pebble/internals/overlord/servstate" "github.com/canonical/pebble/internals/overlord/standby" "github.com/canonical/pebble/internals/overlord/state" "github.com/canonical/pebble/internals/reaper" @@ -1339,25 +1340,21 @@ services: rsp.ServeHTTP(rec, req) c.Check(rec.Result().StatusCode, Equals, 202) - // Wait for the change to be in doing state so that the service is in starting state. - for i := 0; ; i++ { - if i >= 18 { - c.Fatalf("timed out waiting for change") - } - d.state.Lock() - change := d.state.Change(rsp.Change) - var status state.Status - if change != nil { - status = change.Status() - } - d.state.Unlock() - if status == state.DoingStatus { + // Waiting for the change to be in doing state cannot guarantee the service is + // in the starting state, so here we wait until the service is in the starting + // state. We wait up to 25*20=500ms to make sure there is still half a second + // left to stop the service before okayDelay. + for i := 0; i < 25; i++ { + svcInfo, err := d.overlord.ServiceManager().Services([]string{"test1"}) + c.Assert(err, IsNil) + if len(svcInfo) > 0 && svcInfo[0].Current == servstate.StatusActive { break } - time.Sleep(50 * time.Millisecond) + time.Sleep(20 * time.Millisecond) } - // Stop the daemon within the okayDelay, which should stop services in starting state. + // Stop the daemon, which should stop services in starting state. At this point, + // it should still be within the okayDelay. err = d.Stop(nil) c.Assert(err, IsNil) diff --git a/internals/overlord/servstate/manager.go b/internals/overlord/servstate/manager.go index 7b319aab1..ac22d38f7 100644 --- a/internals/overlord/servstate/manager.go +++ b/internals/overlord/servstate/manager.go @@ -367,7 +367,7 @@ func servicesToStop(m *ServiceManager) ([][]string, error) { return nil, err } - // Filter down to only those that are running or in backoff + // Filter down to only those that are starting, running or in backoff m.servicesLock.Lock() defer m.servicesLock.Unlock() var result [][]string