diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml index 402cc1bf..b5e6441d 100644 --- a/.github/workflows/workflow.yml +++ b/.github/workflows/workflow.yml @@ -13,8 +13,9 @@ env: jobs: unit-tests: - + runs-on: ubuntu-latest + timeout-minutes: 20 strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -46,6 +47,7 @@ jobs: slurm: runs-on: ubuntu-latest + timeout-minutes: 20 steps: - uses: actions/checkout@v4 - name: Build the Slurm Docker image @@ -59,6 +61,7 @@ jobs: flux: runs-on: ubuntu-latest + timeout-minutes: 20 steps: - uses: actions/checkout@v4 - name: Pull the Flux Docker image @@ -72,6 +75,7 @@ jobs: pbs: runs-on: ubuntu-latest + timeout-minutes: 20 steps: - uses: actions/checkout@v4 - name: Pull the PBS Docker image @@ -85,6 +89,8 @@ jobs: lint: runs-on: ubuntu-latest + timeout-minutes: 5 + strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -130,6 +136,8 @@ jobs: security: runs-on: ubuntu-latest + timeout-minutes: 5 + strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] diff --git a/src/_canary/testcase.py b/src/_canary/testcase.py index d9b05d3d..3a74813e 100644 --- a/src/_canary/testcase.py +++ b/src/_canary/testcase.py @@ -40,6 +40,7 @@ from .paramset import ParameterSet from .status import Status from .util import filesystem as fs +from .util import kill_process_tree from .util import logging from .util._json import safeload from .util._json import safesave @@ -1756,7 +1757,7 @@ def cancel(sig, frame): self.tee_run_output(proc) get_process_metrics(proc, metrics=metrics) if timeout > 0 and time.monotonic() - start_marker > timeout: - os.kill(proc.pid, signal.SIGINT) + proc.send_signal(signal.SIGINT) raise TimeoutError time.sleep(sleep_interval) except MissingSourceError as e: @@ -1795,6 +1796,8 @@ def cancel(sig, frame): if metrics is not None: self.add_measurement(**metrics) logger.debug(f"{self}: finished with status {self.status}") + logger.debug(f"{self}: cleaning up resources 
(pid: {proc.pid})") + kill_process_tree(proc) self.log_to_stdout( f"Finished running {self.display_name} " f"in {self.duration} s. with exit code {self.returncode}"
diff --git a/src/_canary/util/__init__.py b/src/_canary/util/__init__.py
index b3a047d4..3379d1f3 100644
--- a/src/_canary/util/__init__.py
+++ b/src/_canary/util/__init__.py
@@ -5,6 +5,10 @@
 
 import psutil
 
+from . import logging
+
+logger = logging.get_logger(__name__)
+
 
 def cpu_count(logical: bool | None = None) -> int:
     from .. import config  # lazy import to avoid circular deps
@@ -15,3 +19,56 @@ def cpu_count(logical: bool | None = None) -> int:
     if count is None:
         raise RuntimeError("Unable to determine the number of CPUs")
     return count
+
+
+def _kill_child_processes(proc: psutil.Process) -> list[psutil.Process]:
+    """Kill every descendant of ``proc`` and return the ones that were signalled.
+
+    Best effort: descendants that already exited, or that the current user may not signal,
+    are skipped so that clean-up never raises.
+    """
+    try:
+        children: list[psutil.Process] = proc.children(recursive=True)
+    except psutil.NoSuchProcess:
+        # A root that is already gone cannot be asked for its descendants.
+        logger.debug(f"--> no child processes (root={proc.pid})")
+        return []
+
+    killed: list[psutil.Process] = []
+    for child in children:
+        try:
+            child.kill()
+        except psutil.NoSuchProcess:
+            continue
+        except psutil.AccessDenied:
+            logger.debug(f"--> unable to kill child process ({child.pid}, root={proc.pid})")
+            continue
+        killed.append(child)
+        logger.debug(f"--> killed child process ({child.pid}, root={proc.pid})")
+    return killed
+
+
+def kill_process_tree(proc: psutil.Process | None, timeout: float = 3.0) -> None:
+    """Kill the process tree rooted at ``proc`` and wait for it to go away.
+
+    ``proc`` may be ``None``, in which case nothing is done.  At most ``timeout`` seconds are
+    spent waiting for the killed processes.  Processes that already exited, or that the
+    current user may not signal, are logged and skipped rather than raised.
+    """
+    if proc is None:
+        return
+
+    logger.debug(f"Killing process tree (root={proc.pid})")
+    killed = _kill_child_processes(proc)
+    try:
+        proc.kill()
+    except psutil.NoSuchProcess as e:
+        logger.debug(f"--> root process already finished ({e.pid})")
+    except psutil.AccessDenied:
+        logger.debug(f"--> unable to kill root process ({proc.pid})")
+    else:
+        killed.append(proc)
+    # kill() only sends the signal: wait (bounded) so the killed processes are collected.
+    _, alive = psutil.wait_procs(killed, timeout=timeout)
+    for survivor in alive:
+        logger.debug(f"--> still running after {timeout} s ({survivor.pid}, root={proc.pid})")