@@ -903,23 +903,44 @@ def _watcher(self) -> None:
903
903
logger .log (
904
904
level = 9 , msg = f"Submitting job { to_submit } "
905
905
)
906
- to_submit .htcondor_submit_result = (
907
- self ._client .submit (
908
- to_submit .htcondor_submit ,
909
- count = 1 ,
906
+ try :
907
+ to_submit .htcondor_submit_result = (
908
+ self ._client .submit (
909
+ to_submit .htcondor_submit ,
910
+ count = 1 ,
911
+ )
910
912
)
911
- )
913
+ except OSError as e :
914
+ # Something went wrong, continue and submit
915
+ # this later
916
+ logger .error (f"Error submitting job: { e } " )
917
+ logger .error (traceback .format_exc ())
918
+ logger .error ("Will try later." )
919
+
920
+ # Put the job back in the queue
921
+ self ._queued_jobs_list .appendleft (to_submit )
922
+
923
+ # Wait a bit before trying again
924
+ time .sleep (1 )
925
+ continue
926
+
912
927
logger .log (level = 9 , msg = "Getting cluster id." )
913
928
# Set the cluster id
914
929
to_submit .cluster_id = ( # type: ignore
915
930
to_submit .htcondor_submit_result .cluster ()
916
931
)
917
932
logger .log (level = 9 , msg = "Job submitted." )
918
- # Update the sent timestamp and cluster id
919
- logger .log (
920
- level = 9 , msg = "Updating task status timestamp."
921
- )
933
+ # Move to waiting jobs
934
+ self ._waiting_jobs_deque .append (to_submit )
935
+ newly_queued += 1
936
+ update_meta = True
937
+
922
938
if self ._export_metadata :
939
+ # Update the sent timestamp and cluster id
940
+ logger .log (
941
+ level = 9 ,
942
+ msg = "Updating task status timestamp." ,
943
+ )
923
944
self ._backend_meta .task_status [ # type: ignore
924
945
to_submit .task_id - 1
925
946
].sent_timestamp = datetime .now ()
@@ -932,11 +953,8 @@ def _watcher(self) -> None:
932
953
to_submit .task_id - 1
933
954
].cluster_id = to_submit .cluster_id
934
955
935
- logger .log (level = 9 , msg = "Task status updated" )
936
- # Move to waiting jobs
937
- self ._waiting_jobs_deque .append (to_submit )
938
- newly_queued += 1
939
- update_meta = True
956
+ logger .log (level = 9 , msg = "Task status updated" )
957
+
940
958
if update_meta and self ._export_metadata :
941
959
self .write_metadata ()
942
960
# logger.debug("Waiting 0.1 seconds")
0 commit comments