From ad5045cd65358ff3a3648185c1f4afbfad5ec086 Mon Sep 17 00:00:00 2001 From: justinp <174377431+justinp-tt@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:51:55 -0600 Subject: [PATCH 1/3] Ignore state machine not found during sync --- .../history/replication/executable_sync_hsm_task.go | 12 ++++++++++++ .../replication/executable_sync_hsm_task_test.go | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/service/history/replication/executable_sync_hsm_task.go b/service/history/replication/executable_sync_hsm_task.go index f9ad26dab95..d81835ee372 100644 --- a/service/history/replication/executable_sync_hsm_task.go +++ b/service/history/replication/executable_sync_hsm_task.go @@ -24,6 +24,7 @@ package replication import ( "context" + "errors" "time" "go.temporal.io/api/serviceerror" @@ -37,6 +38,7 @@ import ( "go.temporal.io/server/common/namespace" serviceerrors "go.temporal.io/server/common/serviceerror" ctasks "go.temporal.io/server/common/tasks" + "go.temporal.io/server/service/history/hsm" "go.temporal.io/server/service/history/shard" "google.golang.org/protobuf/types/known/timestamppb" ) @@ -166,6 +168,16 @@ func (e *ExecutableSyncHSMTask) HandleErr(err error) error { } return e.Execute() default: + if errors.Is(err, hsm.ErrStateMachineNotFound) { + e.Logger.Debug("Dropped sync HSM task due to missing state machine - likely deleted", + tag.WorkflowNamespaceID(e.NamespaceID), + tag.WorkflowID(e.WorkflowID), + tag.WorkflowRunID(e.RunID), + tag.TaskID(e.ExecutableTask.TaskID()), + ) + return nil + } + e.Logger.Error("Sync HSM replication task encountered error", tag.WorkflowNamespaceID(e.NamespaceID), tag.WorkflowID(e.WorkflowID), diff --git a/service/history/replication/executable_sync_hsm_task_test.go b/service/history/replication/executable_sync_hsm_task_test.go index 076f4584b5b..fa1a5fb3d95 100644 --- a/service/history/replication/executable_sync_hsm_task_test.go +++ b/service/history/replication/executable_sync_hsm_task_test.go @@ -24,6 +24,7 @@ package replication import ( "errors" + "fmt" "math/rand" "testing" "time" @@ -45,6 +46,7 @@ import ( serviceerrors "go.temporal.io/server/common/serviceerror" "go.temporal.io/server/common/xdc" "go.temporal.io/server/service/history/configs" + "go.temporal.io/server/service/history/hsm" "go.temporal.io/server/service/history/shard" "go.temporal.io/server/service/history/tests" "go.uber.org/mock/gomock" @@ -294,3 +296,13 @@ func (s *executableSyncHSMTaskSuite) TestMarkPoisonPill() { VisibilityTime: timestamppb.New(s.task.TaskCreationTime()), }, replicationTask.RawTaskInfo) } + +func (s *executableSyncHSMTaskSuite) TestHandleErr_StateMachineNotFound() { + s.executableTask.EXPECT().GetNamespaceInfo(gomock.Any(), s.task.NamespaceID).Return(uuid.NewString(), true, nil).AnyTimes() + + err := fmt.Errorf("wrapper: %w", hsm.ErrStateMachineNotFound) + + actualErr := s.task.HandleErr(err) + + s.NoError(actualErr) +} From bc2fdd2e91872d185abcf9f051c478077ef4dbf6 Mon Sep 17 00:00:00 2001 From: justinp <174377431+justinp-tt@users.noreply.github.com> Date: Mon, 23 Dec 2024 13:34:16 -0600 Subject: [PATCH 2/3] Revert "Ignore state machine not found during sync" This reverts commit ad5045cd65358ff3a3648185c1f4afbfad5ec086. --- .../history/replication/executable_sync_hsm_task.go | 12 ------------ .../replication/executable_sync_hsm_task_test.go | 12 ------------ 2 files changed, 24 deletions(-) diff --git a/service/history/replication/executable_sync_hsm_task.go b/service/history/replication/executable_sync_hsm_task.go index d81835ee372..f9ad26dab95 100644 --- a/service/history/replication/executable_sync_hsm_task.go +++ b/service/history/replication/executable_sync_hsm_task.go @@ -24,7 +24,6 @@ package replication import ( "context" - "errors" "time" "go.temporal.io/api/serviceerror" @@ -38,7 +37,6 @@ import ( "go.temporal.io/server/common/namespace" serviceerrors "go.temporal.io/server/common/serviceerror" ctasks "go.temporal.io/server/common/tasks" - "go.temporal.io/server/service/history/hsm" "go.temporal.io/server/service/history/shard" "google.golang.org/protobuf/types/known/timestamppb" ) @@ -168,16 +166,6 @@ func (e *ExecutableSyncHSMTask) HandleErr(err error) error { } return e.Execute() default: - if errors.Is(err, hsm.ErrStateMachineNotFound) { - e.Logger.Debug("Dropped sync HSM task due to missing state machine - likely deleted", - tag.WorkflowNamespaceID(e.NamespaceID), - tag.WorkflowID(e.WorkflowID), - tag.WorkflowRunID(e.RunID), - tag.TaskID(e.ExecutableTask.TaskID()), - ) - return nil - } - e.Logger.Error("Sync HSM replication task encountered error", tag.WorkflowNamespaceID(e.NamespaceID), tag.WorkflowID(e.WorkflowID), diff --git a/service/history/replication/executable_sync_hsm_task_test.go b/service/history/replication/executable_sync_hsm_task_test.go index fa1a5fb3d95..076f4584b5b 100644 --- a/service/history/replication/executable_sync_hsm_task_test.go +++ b/service/history/replication/executable_sync_hsm_task_test.go @@ -24,7 +24,6 @@ package replication import ( "errors" - "fmt" "math/rand" "testing" "time" @@ -46,7 +45,6 @@ import ( serviceerrors "go.temporal.io/server/common/serviceerror" "go.temporal.io/server/common/xdc" "go.temporal.io/server/service/history/configs" - "go.temporal.io/server/service/history/hsm" "go.temporal.io/server/service/history/shard" "go.temporal.io/server/service/history/tests" "go.uber.org/mock/gomock" @@ -296,13 +294,3 @@ func (s *executableSyncHSMTaskSuite) TestMarkPoisonPill() { VisibilityTime: timestamppb.New(s.task.TaskCreationTime()), }, replicationTask.RawTaskInfo) } - -func (s *executableSyncHSMTaskSuite) TestHandleErr_StateMachineNotFound() { - s.executableTask.EXPECT().GetNamespaceInfo(gomock.Any(), s.task.NamespaceID).Return(uuid.NewString(), true, nil).AnyTimes() - - err := fmt.Errorf("wrapper: %w", hsm.ErrStateMachineNotFound) - - actualErr := s.task.HandleErr(err) - - s.NoError(actualErr) -} From 442ef67c6b860e9674133a2bf2b0a22bd2c0d13c Mon Sep 17 00:00:00 2001 From: justinp <174377431+justinp-tt@users.noreply.github.com> Date: Mon, 23 Dec 2024 13:34:35 -0600 Subject: [PATCH 3/3] Ignore state machine not found during sync --- service/history/ndc/hsm_state_replicator.go | 17 ++++++-- .../history/ndc/hsm_state_replicator_test.go | 41 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/service/history/ndc/hsm_state_replicator.go b/service/history/ndc/hsm_state_replicator.go index 722a9c2ce9d..abb9c447711 100644 --- a/service/history/ndc/hsm_state_replicator.go +++ b/service/history/ndc/hsm_state_replicator.go @@ -183,10 +183,19 @@ func (r *HSMStateReplicatorImpl) syncHSMNode( incomingNodePath := incomingNode.Path() currentNode, err := currentHSM.Child(incomingNodePath) if err != nil { - // 1. Already done history resend if needed before, - // and node creation today always associated with an event - // 2. Node deletion is not supported right now. - // Based on 1 and 2, node should always be found here. + // The node may not be found if: + // 1. The state machine was deleted (e.g. terminal state cleanup) + // 2. We're missing events that created this node + if errors.Is(err, hsm.ErrStateMachineNotFound) { + // In terminal state, nodes can be deleted + // Ignore the error and continue processing other nodes + r.logger.Debug("State machine not found - likely deleted in terminal state", + tag.WorkflowNamespaceID(mutableState.GetExecutionInfo().NamespaceId), + tag.WorkflowID(mutableState.GetExecutionInfo().WorkflowId), + tag.WorkflowRunID(mutableState.GetExecutionInfo().OriginalExecutionRunId), + ) + return nil + } return err } diff --git a/service/history/ndc/hsm_state_replicator_test.go b/service/history/ndc/hsm_state_replicator_test.go index c623627c448..38fd47a2554 100644 --- a/service/history/ndc/hsm_state_replicator_test.go +++ b/service/history/ndc/hsm_state_replicator_test.go @@ -702,6 +702,47 @@ func (s *hsmStateReplicatorSuite) TestSyncHSM_IncomingStateNewer_WorkflowClosed( s.NoError(err) } +func (s *hsmStateReplicatorSuite) TestSyncHSM_StateMachineNotFound() { + persistedState := s.buildWorkflowMutableState() + // Remove the child1 state machine so it doesn't exist + delete(persistedState.ExecutionInfo.SubStateMachinesByType[s.stateMachineDef.Type()].MachinesById, "child1") + + s.mockExecutionMgr.EXPECT().GetWorkflowExecution(gomock.Any(), &persistence.GetWorkflowExecutionRequest{ + ShardID: s.mockShard.GetShardID(), + NamespaceID: s.workflowKey.NamespaceID, + WorkflowID: s.workflowKey.WorkflowID, + RunID: s.workflowKey.RunID, + }).Return(&persistence.GetWorkflowExecutionResponse{ + State: persistedState, + DBRecordVersion: 777, + }, nil).Times(1) + + err := s.nDCHSMStateReplicator.SyncHSMState(context.Background(), &shard.SyncHSMRequest{ + WorkflowKey: s.workflowKey, + EventVersionHistory: persistedState.ExecutionInfo.VersionHistories.Histories[0], + StateMachineNode: &persistencespb.StateMachineNode{ + Children: map[string]*persistencespb.StateMachineMap{ + s.stateMachineDef.Type(): { + MachinesById: map[string]*persistencespb.StateMachineNode{ + "child1": { + Data: []byte(hsmtest.State3), + InitialVersionedTransition: &persistencespb.VersionedTransition{ + NamespaceFailoverVersion: s.namespaceEntry.FailoverVersion(), + }, + LastUpdateVersionedTransition: &persistencespb.VersionedTransition{ + NamespaceFailoverVersion: s.namespaceEntry.FailoverVersion() + 100, + }, + TransitionCount: 50, + }, + }, + }, + }, + }, + }) + + s.NoError(err) // Expect no error as we should gracefully handle missing state machines +} + func (s *hsmStateReplicatorSuite) buildWorkflowMutableState() *persistencespb.WorkflowMutableState { info := &persistencespb.WorkflowExecutionInfo{