diff --git a/scripts/node_cleanup.py b/scripts/node_cleanup.py index f1f5b0be2..c260df8a6 100755 --- a/scripts/node_cleanup.py +++ b/scripts/node_cleanup.py @@ -13,13 +13,23 @@ def main(): stale = query.find_stale_locks(args.owner) if not stale: return + by_owner = {} + for node in stale: + if args.owner and node['locked_by'] != args.owner: + log.warning( + f"Node {node['name']} expected to be locked by {args.owner} " + f"but found {node['locked_by']} instead" + ) + continue + by_owner.setdefault(node['locked_by'], []).append(node) if args.dry_run: log.info("Would attempt to unlock:") - for node in stale: - log.info(f"{node['name']}\t{node['description']}") + for owner, nodes in by_owner.items(): + for node in nodes: + log.info(f"{node['name']}\t{node['description']}") else: - names = [node["name"] for node in stale] - ops.unlock_safe(names, args.owner) + for owner, nodes in by_owner.items(): + ops.unlock_safe([node["name"] for node in nodes], owner) def parse_args(argv): parser = argparse.ArgumentParser( diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py index 9ea3e7865..da21dc7d7 100644 --- a/teuthology/lock/ops.py +++ b/teuthology/lock/ops.py @@ -175,7 +175,6 @@ def lock_one(name, user=None, description=None): def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""): - # names = [misc.canonicalize_hostname(name, user=None) for name in names] with teuthology.parallel.parallel() as p: for name in names: p.spawn(unlock_one_safe, name, owner, run_name, job_id) diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py index 8e043b4db..731f8a1b5 100644 --- a/teuthology/lock/query.py +++ b/teuthology/lock/query.py @@ -2,7 +2,7 @@ import os import requests -from typing import Union +from typing import Dict, List, Union from teuthology import misc from teuthology.config import config @@ -87,7 +87,7 @@ def list_locks(keyed_by_name=False, tries=10, **kwargs): return dict() -def find_stale_locks(owner=None): +def find_stale_locks(owner=None) -> List[Dict]: """ Return a list of node dicts corresponding to nodes that were locked to run a job, but the job is no longer running. The purpose of this is to enable @@ -136,7 +136,7 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N :param node: The node dict as returned from the lock server :param cache: A set() used for caching results - :returns: True or False + :returns: A string if the node has an active job, or None if not """ status = status or get_status(name) if not status: @@ -144,6 +144,9 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N return "node had no status" description = status['description'] (run_name, job_id) = description.split('/')[-2:] + if not run_name or job_id == '': + # We thought this node might have a stale job, but no. + return "node description does not contained scheduled job info" url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/" job_status = "" with safe_while(