Skip to content

Commit f9dd39f

Browse files
njzjz and Jinzhe Zeng authored
fix a bug in repeat submission (#165)
* fix a bug in repeat submission

  When a previously failed job is retried but the retry is not actually submitted, the job id should be cleared. Otherwise, the job will be considered failed again (although it was never actually submitted).

* skip logging if it is not submitted

Co-authored-by: Jinzhe Zeng <[email protected]>
1 parent 303d2b1 commit f9dd39f

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

dpdispatcher/submission.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -559,11 +559,12 @@ def handle_unexpected_job_state(self):
559559
if ( self.fail_count ) > 0 and ( self.fail_count % 3 == 0 ) :
560560
raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}")
561561
self.submit_job()
562-
dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id))
563-
time.sleep(0.2)
564-
self.get_job_state()
565-
dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}")
566-
self.handle_unexpected_job_state()
562+
if self.job_state != JobStatus.unsubmitted:
563+
dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id))
564+
time.sleep(0.2)
565+
self.get_job_state()
566+
dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}")
567+
self.handle_unexpected_job_state()
567568

568569
if job_state == JobStatus.unsubmitted:
569570
dlog.debug(f"job: {self.job_hash} unsubmitted; submit it")
@@ -610,8 +611,8 @@ def register_job_id(self, job_id):
610611

611612
def submit_job(self):
612613
job_id = self.machine.do_submit(self)
614+
self.register_job_id(job_id)
613615
if job_id:
614-
self.register_job_id(job_id)
615616
self.job_state = JobStatus.waiting
616617
else:
617618
self.job_state = JobStatus.unsubmitted

0 commit comments

Comments
 (0)