From 1eb3f00bb3b62e9992775e76a60ab77c47d63452 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 4 Mar 2025 17:03:39 -0700 Subject: [PATCH 1/4] util.time.parse_timestamp(): Add alt. format Paddles uses this format. Signed-off-by: Zack Cerza --- teuthology/util/time.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/teuthology/util/time.py b/teuthology/util/time.py index 8e0525fcc3..eb74696025 100644 --- a/teuthology/util/time.py +++ b/teuthology/util/time.py @@ -4,18 +4,23 @@ # When we're not using ISO format, we're using this TIMESTAMP_FMT = "%Y-%m-%d_%H:%M:%S" +# Or this, in the case of paddles timestamps +TIMESTAMP_FMT_ALT = "%Y-%m-%d %H:%M:%S.%f" def parse_timestamp(timestamp: str) -> datetime: """ - timestamp: A string either in ISO 8601 format or TIMESTAMP_FMT. - If no timezone is specified, UTC is assumed. + timestamp: A string either in ISO 8601 format, TIMESTAMP_FMT or + TIMESTAMP_FMT_ALT. If no timezone is specified, UTC is assumed. :returns: a datetime object """ try: dt = datetime.fromisoformat(timestamp) except ValueError: - dt = datetime.strptime(timestamp, TIMESTAMP_FMT) + try: + dt = datetime.strptime(timestamp, TIMESTAMP_FMT) + except ValueError: + dt = datetime.strptime(timestamp, TIMESTAMP_FMT_ALT) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) return dt From bd1aec7ea7f4dec8b8149402439fa465f4099981 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 4 Mar 2025 17:58:31 -0700 Subject: [PATCH 2/4] Remove run name, job id from unlock_* calls This was simply faulty logic. One effect was that the supervisor would refuse to unlock nodes that were used by its own job. Signed-off-by: Zack Cerza --- teuthology/dispatcher/__init__.py | 7 +------ teuthology/dispatcher/supervisor.py | 2 +- teuthology/kill.py | 4 ++-- teuthology/lock/ops.py | 11 ++++------- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py index 59f8ae3279..e1027ffc9c 100644 --- a/teuthology/dispatcher/__init__.py +++ b/teuthology/dispatcher/__init__.py @@ -182,12 +182,7 @@ def main(args): log.exception(error_message) if 'targets' in job_config: node_names = job_config["targets"].keys() - lock_ops.unlock_safe( - node_names, - job_config["owner"], - job_config["name"], - job_config["job_id"] - ) + lock_ops.unlock_safe(node_names, job_config["owner"]) report.try_push_job_info(job_config, dict( status='fail', failure_reason=error_message)) diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py index b89c39ac5a..1936fa636f 100644 --- a/teuthology/dispatcher/supervisor.py +++ b/teuthology/dispatcher/supervisor.py @@ -269,7 +269,7 @@ def unlock_targets(job_config): return if job_config.get("unlock_on_failure", True): log.info('Unlocking machines...') - lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"]) + lock_ops.unlock_safe(locked, job_config["owner"]) def run_with_watchdog(process, job_config): diff --git a/teuthology/kill.py b/teuthology/kill.py index 137e49080e..0914a6f9a1 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -75,7 +75,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, if owner is not None: targets = find_targets(run_name) names = list(targets.keys()) - lock_ops.unlock_safe(names, owner, run_name) + lock_ops.unlock_safe(names, owner) report.try_mark_run_dead(run_name) @@ -103,7 +103,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False) log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus") if not skip_unlock: targets = find_targets(run_name, job_id) - lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id) + lock_ops.unlock_safe(list(targets.keys()), owner) def find_run_info(serializer, run_name): diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py index 52dce68575..3b945df120 100644 --- a/teuthology/lock/ops.py +++ b/teuthology/lock/ops.py @@ -174,24 +174,21 @@ def lock_one(name, user=None, description=None): return response -def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""): +def unlock_safe(names: List[str], owner: str): with teuthology.parallel.parallel() as p: for name in names: - p.spawn(unlock_one_safe, name, owner, run_name, job_id) + p.spawn(unlock_one_safe, name, owner) return all(p) -def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool: +def unlock_one_safe(name: str, owner: str) -> bool: node_status = query.get_status(name) if node_status.get("locked", False) is False: - log.warn(f"Refusing to unlock {name} since it is already unlocked") + log.debug(f"Refusing to unlock {name} since it is already unlocked") return False maybe_job = query.node_active_job(name, node_status) if not maybe_job: return unlock_one(name, owner, node_status["description"], node_status) - if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"): - log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}") - return False log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}") return False From 7443f9e670c86fa9fddf74c3b51fcad90e16ca04 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 4 Mar 2025 18:02:41 -0700 Subject: [PATCH 3/4] lock.ops.unlock_many: Add type hints Signed-off-by: Zack Cerza --- teuthology/lock/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py index 3b945df120..78d082a96b 100644 --- a/teuthology/lock/ops.py +++ b/teuthology/lock/ops.py @@ -193,13 +193,13 @@ def unlock_one_safe(name: str, owner: str) -> bool: return False -def unlock_many(names, user): +def unlock_many(names: List[str], owner: str): fixed_names = [misc.canonicalize_hostname(name, user=None) for name in names] names = fixed_names uri = os.path.join(config.lock_server, 'nodes', 'unlock_many', '') data = dict( - locked_by=user, + locked_by=owner, names=names, ) with safe_while( From 2090cb42810331d160c15b6bf19eb4621770b311 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 4 Mar 2025 18:03:25 -0700 Subject: [PATCH 4/4] supervisor: Don't use "safe" unlock methods The supervisor knows when its job's nodes are no longer needed. Signed-off-by: Zack Cerza --- teuthology/dispatcher/supervisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py index 1936fa636f..823196f32b 100644 --- a/teuthology/dispatcher/supervisor.py +++ b/teuthology/dispatcher/supervisor.py @@ -269,7 +269,7 @@ def unlock_targets(job_config): return if job_config.get("unlock_on_failure", True): log.info('Unlocking machines...') - lock_ops.unlock_safe(locked, job_config["owner"]) + lock_ops.unlock_many(locked, job_config["owner"]) def run_with_watchdog(process, job_config):