Skip to content

Commit

Permalink
PullRequest: 392 Add cleanup for FailOverContext when error recovery …
Browse files Browse the repository at this point in the history
…fails

Merge branch 'fix-worker-node-fo-context-cleanup-090dev' of [email protected]:ray-project/mars.git into 0.9-dev

https://code.alipay.com/ray-project/mars/pull_requests/392


Signed-off-by: 慕白 <[email protected]>


* Add cleanup for FailOverContext when error recovery fails
  • Loading branch information
不涸 authored and zhongchun committed Dec 29, 2022
1 parent ca11010 commit 94ec1e8
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
3 changes: 3 additions & 0 deletions mars/services/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,5 +308,8 @@ def enable_lineage(self):
def is_lineage_enabled(self):
    """Return the value currently stored in ``self._enable_lineage``."""
    return self._enable_lineage

def cleanup(self):
    """Drop all entries held in ``self.subtask_to_dependency_subtasks``.

    Only this one attribute is cleared here; no other state of the
    context (e.g. ``_enable_lineage``) is touched by this method.
    """
    self.subtask_to_dependency_subtasks.clear()


# Module-level instance of ``_FailOverContext``; callers use it directly,
# e.g. ``FailOverContext.is_lineage_enabled()`` / ``FailOverContext.cleanup()``.
FailOverContext = _FailOverContext()
7 changes: 6 additions & 1 deletion mars/services/task/execution/mars/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,10 +345,10 @@ async def _run(self):
return await self._get_stage_result()

async def cancel(self):
logger.info("Start to cancel stage %s of task %s.", self.stage_id, self.task)
if self._done.is_set() or self._cancelled.is_set(): # pragma: no cover
# already finished, ignore cancel
return
logger.info("Start to cancel stage %s of task %s.", self.stage_id, self.task)
self._cancelled.set()
# cancel running subtasks
await self._scheduling_api.cancel_subtasks(list(self._submitted_subtask_ids))
Expand Down Expand Up @@ -458,6 +458,7 @@ async def _detect_error(self, subtask, error, expect_error_cls_tuple):
if not FailOverContext.is_lineage_enabled():
logger.info("Lineage of failover is not enabled.")
return False

# Note: There are some errors that do not need to be handled,
# like `DuplicatedSubtaskError`.
if isinstance(error, DuplicatedSubtaskError):
Expand Down Expand Up @@ -510,6 +511,7 @@ async def _detect_error(self, subtask, error, expect_error_cls_tuple):
s,
subtask,
)
FailOverContext.cleanup()
return False
if s not in dependency_subtasks:
order = await task_manager_ref.get_generation_order(
Expand All @@ -525,6 +527,7 @@ async def _detect_error(self, subtask, error, expect_error_cls_tuple):
"No dependent subtasks to restore of subtask %s.",
subtask.subtask_id,
)
FailOverContext.cleanup()
return False
priorities = [
(pri,) + s.priority
Expand All @@ -550,9 +553,11 @@ async def _detect_error(self, subtask, error, expect_error_cls_tuple):
)
return True
except:
FailOverContext.cleanup()
logger.exception("Error recovery failed.")
return False
else:
FailOverContext.cleanup()
logger.error("Could not to recover the error: %s", error)
return False

Expand Down

0 comments on commit 94ec1e8

Please sign in to comment.