From 5022dfbb112d96eae5c50f28414a16d69633b026 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 28 May 2024 13:08:57 -0600 Subject: [PATCH 1/3] lock.query.find_stale_locks: Fix a false positive Also fix the return type and docstring, which were inconsistent. Signed-off-by: Zack Cerza --- teuthology/lock/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py index 8e043b4db..731f8a1b5 100644 --- a/teuthology/lock/query.py +++ b/teuthology/lock/query.py @@ -2,7 +2,7 @@ import os import requests -from typing import Union +from typing import Dict, List, Union from teuthology import misc from teuthology.config import config @@ -87,7 +87,7 @@ def list_locks(keyed_by_name=False, tries=10, **kwargs): return dict() -def find_stale_locks(owner=None): +def find_stale_locks(owner=None) -> List[Dict]: """ Return a list of node dicts corresponding to nodes that were locked to run a job, but the job is no longer running. The purpose of this is to enable @@ -136,7 +136,7 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N :param node: The node dict as returned from the lock server :param cache: A set() used for caching results - :returns: True or False + :returns: A string if the node has an active job, or None if not """ status = status or get_status(name) if not status: @@ -144,6 +144,9 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N return "node had no status" description = status['description'] (run_name, job_id) = description.split('/')[-2:] + if not run_name or job_id == '': + # We thought this node might have a stale job, but no. + return "node description does not contained scheduled job info" url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/" job_status = "" with safe_while( From a02d01c52b44fa6013fe4778246107f80aeb0f23 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 28 May 2024 14:40:18 -0600 Subject: [PATCH 2/3] node_cleanup: Fix invocations without --owner Signed-off-by: Zack Cerza --- scripts/node_cleanup.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/node_cleanup.py b/scripts/node_cleanup.py index f1f5b0be2..c260df8a6 100755 --- a/scripts/node_cleanup.py +++ b/scripts/node_cleanup.py @@ -13,13 +13,23 @@ def main(): stale = query.find_stale_locks(args.owner) if not stale: return + by_owner = {} + for node in stale: + if args.owner and node['locked_by'] != args.owner: + log.warning( + f"Node {node['name']} expected to be locked by {args.owner} " + f"but found {node['locked_by']} instead" + ) + continue + by_owner.setdefault(node['locked_by'], []).append(node) if args.dry_run: log.info("Would attempt to unlock:") - for node in stale: - log.info(f"{node['name']}\t{node['description']}") + for owner, nodes in by_owner.items(): + for node in nodes: + log.info(f"{node['name']}\t{node['description']}") else: - names = [node["name"] for node in stale] - ops.unlock_safe(names, args.owner) + for owner, nodes in by_owner.items(): + ops.unlock_safe([node["name"] for node in nodes], owner) def parse_args(argv): parser = argparse.ArgumentParser( From 5072e7488a7ea4a603fde2b3257777cc5b873d4e Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 28 May 2024 14:45:54 -0600 Subject: [PATCH 3/3] lock.ops.unlock_safe: Remove stray comment Signed-off-by: Zack Cerza --- teuthology/lock/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py index 9ea3e7865..da21dc7d7 100644 --- a/teuthology/lock/ops.py +++ b/teuthology/lock/ops.py @@ -175,7 +175,6 @@ def lock_one(name, user=None, description=None): def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""): - # names = [misc.canonicalize_hostname(name, user=None) for name in names] with teuthology.parallel.parallel() as p: for name in names: p.spawn(unlock_one_safe, name, owner, run_name, job_id)