-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Preserve workflow task attempt when applying buffered events #8769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
aa6df49
3444228
feb640e
8c3a2d5
84f5e8c
2cd6eba
ddf398d
ee13ae0
da02cc2
a29e436
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ type WFTFailureReportedProblemsTestSuite struct { | |
| shouldFail atomic.Bool | ||
| failureCount atomic.Int32 | ||
| failureType atomic.Int32 // 0 = panic, 1 = non-deterministic error | ||
| stopSignals atomic.Bool | ||
| } | ||
|
|
||
| func TestWFTFailureReportedProblemsTestSuite(t *testing.T) { | ||
|
|
@@ -31,7 +32,7 @@ func TestWFTFailureReportedProblemsTestSuite(t *testing.T) { | |
|
|
||
| func (s *WFTFailureReportedProblemsTestSuite) SetupTest() { | ||
| s.FunctionalTestBase.SetupTest() | ||
| s.OverrideDynamicConfig(dynamicconfig.NumConsecutiveWorkflowTaskProblemsToTriggerSearchAttribute, 2) | ||
| s.OverrideDynamicConfig(dynamicconfig.NumConsecutiveWorkflowTaskProblemsToTriggerSearchAttribute, 3) | ||
| } | ||
|
|
||
| func (s *WFTFailureReportedProblemsTestSuite) simpleWorkflowWithShouldFail(ctx workflow.Context) (string, error) { | ||
|
|
@@ -45,6 +46,24 @@ func (s *WFTFailureReportedProblemsTestSuite) simpleActivity() (string, error) { | |
| return "done!", nil | ||
| } | ||
|
|
||
| // workflowWithSignalsThatFails creates a workflow that listens for signals and fails on each workflow task. | ||
| // This is used to test that the TemporalReportedProblems search attribute is not incorrectly removed | ||
| // when signals keep coming in despite continuous workflow task failures. | ||
| func (s *WFTFailureReportedProblemsTestSuite) workflowWithSignalsThatFails(ctx workflow.Context) (string, error) { | ||
| signalChan := workflow.GetSignalChannel(ctx, "test-signal") | ||
|
|
||
| var signalValue string | ||
| signalChan.Receive(ctx, &signalValue) | ||
|
Comment on lines
+53
to
+56
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't really need to handle the signals in the workflow, it doesn't add value in the test and prevents your workflow task from failing because you never reach the line where it panics. |
||
|
|
||
| // Always fail after receiving a signal, unless shouldFail is false | ||
| if s.shouldFail.Load() { | ||
| panic("forced-panic-after-signal") | ||
| } | ||
|
|
||
| // If we reach here, shouldFail is false, so we can complete | ||
| return "done!", nil | ||
| } | ||
|
|
||
| // workflowWithActivity creates a workflow that executes an activity before potentially failing. | ||
| // This is used to test workflow task failure scenarios in a more realistic context where the workflow | ||
| // has already executed some operations (activities) before encountering a workflow task failure. | ||
|
|
@@ -95,7 +114,7 @@ func (s *WFTFailureReportedProblemsTestSuite) TestWFTFailureReportedProblems_Set | |
| execution, err := s.SdkClient().DescribeWorkflowExecution(ctx, workflowRun.GetID(), workflowRun.GetRunID()) | ||
| require.NoError(t, err) | ||
| require.GreaterOrEqual(t, execution.PendingWorkflowTask.Attempt, int32(2)) | ||
| }, 5*time.Second, 500*time.Millisecond) | ||
| }, 20*time.Second, 500*time.Millisecond) | ||
|
|
||
| // Unblock the workflow | ||
| s.shouldFail.Store(false) | ||
|
|
@@ -110,6 +129,87 @@ func (s *WFTFailureReportedProblemsTestSuite) TestWFTFailureReportedProblems_Set | |
| s.False(ok) | ||
| } | ||
|
|
||
| func (s *WFTFailureReportedProblemsTestSuite) TestWFTFailureReportedProblems_NotClearedBySignals() { | ||
| ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) | ||
| defer cancel() | ||
|
|
||
| s.shouldFail.Store(true) | ||
| s.stopSignals.Store(false) | ||
|
|
||
| s.Worker().RegisterWorkflow(s.workflowWithSignalsThatFails) | ||
|
|
||
| workflowOptions := sdkclient.StartWorkflowOptions{ | ||
| ID: testcore.RandomizeStr("wf_id-" + s.T().Name()), | ||
| TaskQueue: s.TaskQueue(), | ||
| } | ||
|
|
||
| workflowRun, err := s.SdkClient().ExecuteWorkflow(ctx, workflowOptions, s.workflowWithSignalsThatFails) | ||
| s.NoError(err) | ||
|
|
||
| // Start sending signals every second in a goroutine | ||
| signalDone := make(chan struct{}) | ||
| go func() { | ||
| defer close(signalDone) | ||
| ticker := time.NewTicker(1 * time.Second) | ||
| defer ticker.Stop() | ||
|
|
||
| for { | ||
| select { | ||
| case <-ticker.C: | ||
| if s.stopSignals.Load() { | ||
| return | ||
| } | ||
| _ = s.SdkClient().SignalWorkflow(ctx, workflowRun.GetID(), workflowRun.GetRunID(), "test-signal", "ping") | ||
| case <-ctx.Done(): | ||
| return | ||
| } | ||
| } | ||
| }() | ||
|
|
||
| // Wait for the search attribute to be set due to consecutive failures | ||
| s.EventuallyWithT(func(t *assert.CollectT) { | ||
| description, err := s.SdkClient().DescribeWorkflow(ctx, workflowRun.GetID(), workflowRun.GetRunID()) | ||
| require.NoError(t, err) | ||
| saVal, ok := description.TypedSearchAttributes.GetKeywordList(temporal.NewSearchAttributeKeyKeywordList(sadefs.TemporalReportedProblems)) | ||
| require.True(t, ok) | ||
| require.NotEmpty(t, saVal) | ||
| require.Contains(t, saVal, "category=WorkflowTaskFailed") | ||
| require.Contains(t, saVal, "cause=WorkflowTaskFailedCauseWorkflowWorkerUnhandledFailure") | ||
| }, 30*time.Second, 500*time.Millisecond) | ||
|
|
||
| // Continue sending signals for a few more seconds and verify the search attribute is NOT removed | ||
| // This is the key part of the test - signals should not cause the search attribute to be cleared | ||
|
|
||
| s.EventuallyWithT(func(t *assert.CollectT) { | ||
| description, err := s.SdkClient().DescribeWorkflow(ctx, workflowRun.GetID(), workflowRun.GetRunID()) | ||
| s.NoError(err) | ||
| saVal, ok := description.TypedSearchAttributes.GetKeywordList(temporal.NewSearchAttributeKeyKeywordList(sadefs.TemporalReportedProblems)) | ||
| s.True(ok, "Search attribute should still be present after receiving signals") | ||
| s.NotEmpty(saVal, "Search attribute should not be empty after receiving signals") | ||
| }, 5*time.Second, 500*time.Millisecond) | ||
|
|
||
| // Stop signals and unblock the workflow for cleanup | ||
| s.stopSignals.Store(true) | ||
| s.shouldFail.Store(false) | ||
|
|
||
| // Send one final signal to trigger workflow completion | ||
| err = s.SdkClient().SignalWorkflow(ctx, workflowRun.GetID(), workflowRun.GetRunID(), "test-signal", "final") | ||
| s.NoError(err) | ||
|
|
||
| var out string | ||
| s.NoError(workflowRun.Get(ctx, &out)) | ||
|
|
||
| // Wait for signal goroutine to finish | ||
| <-signalDone | ||
|
|
||
| // Verify search attribute is cleared after successful completion | ||
| description, err := s.SdkClient().DescribeWorkflow(ctx, workflowRun.GetID(), workflowRun.GetRunID()) | ||
| s.NoError(err) | ||
| s.NotNil(description.TypedSearchAttributes) | ||
| _, ok := description.TypedSearchAttributes.GetKeywordList(temporal.NewSearchAttributeKeyKeywordList(sadefs.TemporalReportedProblems)) | ||
| s.False(ok) | ||
| } | ||
|
|
||
| func (s *WFTFailureReportedProblemsTestSuite) TestWFTFailureReportedProblems_SetAndClear_FailAfterActivity() { | ||
| ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) | ||
| defer cancel() | ||
|
|
@@ -137,7 +237,7 @@ func (s *WFTFailureReportedProblemsTestSuite) TestWFTFailureReportedProblems_Set | |
| require.Len(t, saValues, 2) | ||
| require.Contains(t, saValues, "category=WorkflowTaskFailed") | ||
| require.Contains(t, saValues, "cause=WorkflowTaskFailedCauseWorkflowWorkerUnhandledFailure") | ||
| }, 5*time.Second, 500*time.Millisecond) | ||
| }, 20*time.Second, 500*time.Millisecond) | ||
|
|
||
| // Unblock the workflow | ||
| s.shouldFail.Store(false) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you update WorkflowTaskAttemptsSinceLastSuccess every time there is a WFT failure instead of only doing it when flushing buffered events? That seems more intuitive behavior to keep this attribute up-to-date.