Skip to content

Commit 0870a01

Browse files
committed
Consider errors from htcondor scheduler
1 parent 1bd253c commit 0870a01

File tree

1 file changed

+32
-14
lines changed

1 file changed

+32
-14
lines changed

joblib_htcondor/backend.py

+32 -14
Original file line number | Diff line number | Diff line change
@@ -903,23 +903,44 @@ def _watcher(self) -> None:
903903
logger.log(
904904
level=9, msg=f"Submitting job {to_submit}"
905905
)
906-
to_submit.htcondor_submit_result = (
907-
self._client.submit(
908-
to_submit.htcondor_submit,
909-
count=1,
906+
try:
907+
to_submit.htcondor_submit_result = (
908+
self._client.submit(
909+
to_submit.htcondor_submit,
910+
count=1,
911+
)
910912
)
911-
)
913+
except OSError as e:
914+
# Something went wrong, continue and submit
915+
# this later
916+
logger.error(f"Error submitting job: {e}")
917+
logger.error(traceback.format_exc())
918+
logger.error("Will try later.")
919+
920+
# Put the job back in the queue
921+
self._queued_jobs_list.appendleft(to_submit)
922+
923+
# Wait a bit before trying again
924+
time.sleep(1)
925+
continue
926+
912927
logger.log(level=9, msg="Getting cluster id.")
913928
# Set the cluster id
914929
to_submit.cluster_id = ( # type: ignore
915930
to_submit.htcondor_submit_result.cluster()
916931
)
917932
logger.log(level=9, msg="Job submitted.")
918-
# Update the sent timestamp and cluster id
919-
logger.log(
920-
level=9, msg="Updating task status timestamp."
921-
)
933+
# Move to waiting jobs
934+
self._waiting_jobs_deque.append(to_submit)
935+
newly_queued += 1
936+
update_meta = True
937+
922938
if self._export_metadata:
939+
# Update the sent timestamp and cluster id
940+
logger.log(
941+
level=9,
942+
msg="Updating task status timestamp.",
943+
)
923944
self._backend_meta.task_status[ # type: ignore
924945
to_submit.task_id - 1
925946
].sent_timestamp = datetime.now()
@@ -932,11 +953,8 @@ def _watcher(self) -> None:
932953
to_submit.task_id - 1
933954
].cluster_id = to_submit.cluster_id
934955

935-
logger.log(level=9, msg="Task status updated")
936-
# Move to waiting jobs
937-
self._waiting_jobs_deque.append(to_submit)
938-
newly_queued += 1
939-
update_meta = True
956+
logger.log(level=9, msg="Task status updated")
957+
940958
if update_meta and self._export_metadata:
941959
self.write_metadata()
942960
# logger.debug("Waiting 0.1 seconds")

0 commit comments

Comments
 (0)