diff --git a/changelog/1327.bugfix.rst b/changelog/1327.bugfix.rst
new file mode 100644
index 00000000..cb1c7ec3
--- /dev/null
+++ b/changelog/1327.bugfix.rst
@@ -0,0 +1 @@
+Fix hang with `--dist=loadgroup` if a crashed worker is replaced.
\ No newline at end of file
diff --git a/src/xdist/scheduler/loadscope.py b/src/xdist/scheduler/loadscope.py
index 73162dcd..e59828e0 100644
--- a/src/xdist/scheduler/loadscope.py
+++ b/src/xdist/scheduler/loadscope.py
@@ -350,6 +350,12 @@ def schedule(self) -> None:
 
         # Initial distribution already happened, reschedule on all nodes
         if self.collection is not None:
+            for node in self.nodes:
+                self._reschedule(node)
+            # Ensure nodes have at least two work units if possible,
+            # since workers need a "next item" before running the current one.
+            # (A restarted worker has no item before calling _reschedule()
+            # for the first time.)
             for node in self.nodes:
                 self._reschedule(node)
             return
diff --git a/testing/acceptance_test.py b/testing/acceptance_test.py
index 1b44985d..47023c9b 100644
--- a/testing/acceptance_test.py
+++ b/testing/acceptance_test.py
@@ -974,6 +974,29 @@ def test_b(): pass
             ]
         )
 
+    def test_loadgroup_does_not_hang_after_restart2(
+        self, pytester: pytest.Pytester
+    ) -> None:
+        """Fix test suite never finishing in case a worker has to be restarted
+        if there is still work to be done (#1327)."""
+        f = pytester.makepyfile(
+            """
+            import os
+            def test_a(): os._exit(1)
+            def test_b(): pass
+        """
+        )
+        res = pytester.runpytest(f, "-n1", "--dist=loadgroup")
+        res.stdout.fnmatch_lines(
+            [
+                "replacing crashed worker gw*",
+                "worker*crashed while running*",
+                # test_a kills its worker and is reported as failed; test_b
+                # must still run on the replacement worker and pass.
+                "*1 failed*1 passed*",
+            ]
+        )
+
     def test_max_worker_restart(self, pytester: pytest.Pytester) -> None:
         f = pytester.makepyfile(
             """