From dbc00ec68a1751f7ccad0bf7f9b2f98ef4208b8f Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Mon, 15 Sep 2025 11:25:39 -0600
Subject: [PATCH 1/2] lock.ops.lock_many: Unlock on reimage failure

Signed-off-by: Zack Cerza
---
 teuthology/lock/ops.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py
index 4fb6ba86a..289f90966 100644
--- a/teuthology/lock/ops.py
+++ b/teuthology/lock/ops.py
@@ -143,7 +143,12 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
                 update_nodes(ok_machs)
                 return ok_machs
             elif reimage and machine_type in reimage_types:
-                return reimage_machines(ctx, machines, machine_type)
+                try:
+                    return reimage_machines(ctx, machines, machine_type)
+                except Exception:
+                    log.exception('Reimaging error. Unlocking machines...')
+                    unlock_many(machines, user)
+                    continue
             return machines
         elif response.status_code == 503:
             log.error('Insufficient nodes available to lock %d %s nodes.',

From 7162e50f647aab1d9dd85c3bd6df8b3aeeedbf25 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Mon, 15 Sep 2025 11:26:11 -0600
Subject: [PATCH 2/2] supervisor.reimage: Change error message wording

Signed-off-by: Zack Cerza
---
 teuthology/dispatcher/supervisor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index b89c39ac5..23c7d18c1 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -225,7 +225,7 @@ def reimage(job_config):
     try:
         reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
     except Exception as e:
-        log.exception('Reimaging error. Nuking machines...')
+        log.exception('Reimaging error. Unlocking machines...')
         unlock_targets(job_config)
         # Reimage failures should map to the 'dead' status instead of 'fail'
         report.try_push_job_info(