-
Notifications
You must be signed in to change notification settings - Fork 15
fix(sdk):Parity on checkpoint errors #135
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,10 @@ | |
| import time | ||
| from dataclasses import dataclass | ||
| from enum import Enum | ||
| from typing import TYPE_CHECKING, Self, TypedDict | ||
| from typing import TYPE_CHECKING, Literal, Self, TypedDict | ||
|
|
||
| BAD_REQUEST_ERROR: int = 400 | ||
| SERVICE_ERROR: int = 500 | ||
|
|
||
| if TYPE_CHECKING: | ||
| import datetime | ||
|
|
@@ -22,7 +25,7 @@ class AwsErrorObj(TypedDict): | |
| class AwsErrorMetadata(TypedDict): | ||
| RequestId: str | None | ||
| HostId: str | None | ||
| HTTPStatusCode: str | None | ||
| HTTPStatusCode: int | None | ||
| HTTPHeaders: str | None | ||
| RetryAttempts: str | None | ||
|
|
||
|
|
@@ -121,12 +124,16 @@ def __init__(self, message: str, step_id: str | None = None): | |
| self.step_id = step_id | ||
|
|
||
|
|
||
| CheckpointErrorKind = Literal["Execution", "Invocation"] | ||
|
|
||
|
|
||
| class CheckpointError(BotoClientError): | ||
| """Failure to checkpoint. Will terminate the lambda.""" | ||
|
|
||
| def __init__( | ||
| self, | ||
| message: str, | ||
| error_kind: CheckpointErrorKind, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .category = ErrorCategory.INVOCATION |
||
| error: AwsErrorObj | None = None, | ||
| response_metadata: AwsErrorMetadata | None = None, | ||
| ): | ||
|
|
@@ -136,6 +143,40 @@ def __init__( | |
| response_metadata, | ||
| termination_reason=TerminationReason.CHECKPOINT_FAILED, | ||
| ) | ||
| self.error_kind: CheckpointErrorKind = error_kind | ||
|
|
||
| @classmethod | ||
| def from_exception(cls, exception: Exception) -> CheckpointError: | ||
| base = BotoClientError.from_exception(exception) | ||
| metadata: AwsErrorMetadata | None = base.response_metadata | ||
| error: AwsErrorObj | None = base.error | ||
| error_kind: CheckpointErrorKind = "Invocation" | ||
|
|
||
| # InvalidParameterValueException and error message starts with "Invalid Checkpoint Token" is an InvocationError | ||
| # all other 4xx errors are Execution Errors and should be retried | ||
| # all 5xx errors are Invocation Errors | ||
| status_code: int | None = (metadata and metadata.get("HTTPStatusCode")) or None | ||
| if ( | ||
| status_code | ||
| # if we are in 4xx range and is not an InvalidParameterValueException with Invalid Checkpoint Token | ||
| # then it's an execution error | ||
| and status_code < SERVICE_ERROR | ||
| and status_code >= BAD_REQUEST_ERROR | ||
| and error | ||
| and ( | ||
| # is not InvalidParam => Execution | ||
| (error.get("Code", "") or "") != "InvalidParameterValueException" | ||
| # is not Invalid Token => Execution | ||
| or not (error.get("Message") or "").startswith( | ||
| "Invalid Checkpoint Token" | ||
| ) | ||
| ) | ||
| ): | ||
| error_kind = "Execution" | ||
| return CheckpointError(str(exception), error_kind, error, metadata) | ||
|
|
||
| def should_be_retried(self): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is_retriable |
||
| return self.error_kind == "Execution" | ||
|
|
||
|
|
||
| class ValidationError(DurableExecutionsError): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -321,6 +321,16 @@ def wrapper(event: Any, context: LambdaContext) -> MutableMapping[str, Any]: | |
| else: | ||
| logger.exception("Checkpoint processing failed") | ||
| # Raise the original exception | ||
| if ( | ||
| isinstance(bg_error.source_exception, CheckpointError) | ||
| and bg_error.source_exception.should_be_retried() | ||
| ): | ||
| raise bg_error.source_exception from None # Terminate Lambda immediately and have it be retried | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We use |
||
| if isinstance(bg_error.source_exception, CheckpointError): | ||
| return DurableExecutionInvocationOutput( | ||
| status=InvocationStatus.FAILED, | ||
| error=ErrorObject.from_exception(bg_error.source_exception), | ||
| ).to_dict() | ||
| raise bg_error.source_exception from bg_error | ||
|
|
||
| except SuspendExecution: | ||
|
|
@@ -336,7 +346,11 @@ def wrapper(event: Any, context: LambdaContext) -> MutableMapping[str, Any]: | |
| "Checkpoint system failed", | ||
| extra=e.build_logger_extras(), | ||
| ) | ||
| raise # Terminate Lambda immediately | ||
| if e.should_be_retried(): | ||
| raise # Terminate Lambda immediately and have it be retried | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we don't do from since this is an unwrapped CheckpointError. |
||
| return DurableExecutionInvocationOutput( | ||
| status=InvocationStatus.FAILED, error=ErrorObject.from_exception(e) | ||
| ).to_dict() | ||
| except InvocationError: | ||
| logger.exception("Invocation error. Must terminate.") | ||
| # Throw the error to trigger Lambda retry | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not enum?