From 74c33eacf4325b6e0f3e3b11fc6c50b4222921a1 Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Wed, 17 Jan 2024 23:07:12 +0800 Subject: [PATCH] [active-standby] Probe the link in suspend timeout (#235) Approach What is the motivation for this PR? Fix the issue: After the link probe unknown, the standby ToR fails to take the link. When the link recovers, the active ToR is stuck in heartbeat suspension. Work item tracking Microsoft ADO (number only): 26281338 How did you do it? Let's probe the link in suspension timeout if current state is (unknown, active, up). Kudos to @zjswhhh to point out we should force sending the heartbeat in link detect. How did you verify/test it? UT and verify on testbed. Any platform specific information? --- .../LinkManagerStateMachineActiveStandby.cpp | 6 ++++ .../LinkManagerStateMachineActiveStandby.h | 12 +++++++ src/link_prober/LinkProber.cpp | 23 +++++++++--- src/link_prober/LinkProber.h | 13 ++++++- test/FakeLinkProber.cpp | 7 ++++ test/FakeLinkProber.h | 2 ++ test/FakeMuxPort.cpp | 3 ++ test/LinkManagerStateMachineTest.cpp | 35 +++++++++++++++++++ 8 files changed, 95 insertions(+), 6 deletions(-) diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp index d6008073..52bd2d6f 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp @@ -366,6 +366,9 @@ void ActiveStandbyStateMachine::handleSwssBladeIpv4AddressUpdate(boost::asio::ip mProbePeerTorFnPtr = boost::bind( &link_prober::LinkProber::probePeerTor, mLinkProberPtr.get() ); + mDetectLinkFnPtr = boost::bind( + &link_prober::LinkProber::detectLink, mLinkProberPtr.get() + ); mSuspendTxFnPtr = boost::bind( &link_prober::LinkProber::suspendTxProbes, mLinkProberPtr.get(), boost::placeholders::_1 ); @@ -490,6 +493,7 @@ void ActiveStandbyStateMachine::handleStateChange(LinkProberEvent &event, link_p if (ps(mCompositeState) != link_prober::LinkProberState::Unknown) { mResumeTxFnPtr(); + mUnknownActiveUpBackoffFactor = 1; } updateMuxLinkmgrState(); @@ -829,6 +833,8 @@ void ActiveStandbyStateMachine::handleSuspendTimerExpiry() CompositeState currState = mCompositeState; enterMuxWaitState(mCompositeState); LOGINFO_MUX_STATE_TRANSITION(mMuxPortConfig.getPortName(), currState, mCompositeState); + // detect if link/server recovers + mDetectLinkFnPtr(); } else { mUnknownActiveUpBackoffFactor = 1; } diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.h b/src/link_manager/LinkManagerStateMachineActiveStandby.h index b7916a43..147b952a 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.h +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.h @@ -755,6 +755,17 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, */ void setProbePeerTorFnPtr(boost::function probePeerTorFnPtr) {mProbePeerTorFnPtr = probePeerTorFnPtr;}; + /** + *@method setDetectLinkFnPtr + * + *@brief set new DetectLinkFnPtr for the state machine. This method is used for testing + * + *@param detectLinkFnPtr (in) pointer to new DetectLinkFnPtr + * + *@return none + */ + void setDetectLinkFnPtr(boost::function detectLinkFnPtr) {mDetectLinkFnPtr = detectLinkFnPtr;}; + /** *@method setSuspendTxFnPtr * @@ -852,6 +863,7 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, boost::function mInitializeProberFnPtr; boost::function mStartProbingFnPtr; boost::function mProbePeerTorFnPtr; + boost::function mDetectLinkFnPtr; boost::function mSuspendTxFnPtr; boost::function mResumeTxFnPtr; boost::function mSendPeerSwitchCommandFnPtr; diff --git a/src/link_prober/LinkProber.cpp b/src/link_prober/LinkProber.cpp index 98128b2f..26f3ed88 100644 --- a/src/link_prober/LinkProber.cpp +++ b/src/link_prober/LinkProber.cpp @@ -231,7 +231,21 @@ void LinkProber::updateEthernetFrame() void LinkProber::probePeerTor() { boost::asio::io_service &ioService = mStrand.context(); - ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this))); + ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this, false))); +} + +// +// ---> detectLink(); +// +// send HBs to detect the link status +// +void LinkProber::detectLink() +{ + boost::asio::io_service &ioService = mStrand.context(); + for (uint32_t i = 0; i < mMuxPortConfig.getPositiveStateChangeRetryCount(); ++i) + { + ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this, true))); + } } // @@ -304,18 +318,17 @@ void LinkProber::handleSendProbeCommand() } // -// ---> sendHeartbeat() +// ---> sendHeartbeat(bool forceSend) // // send ICMP ECHOREQUEST packet // -void LinkProber::sendHeartbeat() +void LinkProber::sendHeartbeat(bool forceSend) { MUXLOGTRACE(mMuxPortConfig.getPortName()); updateIcmpSequenceNo(); - // check if suspend timer is running - if ((!mSuspendTx) && (!mShutdownTx)) { + if (forceSend || ((!mSuspendTx) && (!mShutdownTx))) { boost::system::error_code errorCode; mStream.write_some(boost::asio::buffer(mTxBuffer.data(), mTxPacketSize), errorCode); diff --git a/src/link_prober/LinkProber.h b/src/link_prober/LinkProber.h index 4d9eb528..d01c0a8a 100644 --- a/src/link_prober/LinkProber.h +++ b/src/link_prober/LinkProber.h @@ -167,6 +167,15 @@ class LinkProber */ void probePeerTor(); + /** + *@method detectLink + * + *@brief detect link status + * + *@return none + */ + void detectLink(); + /** *@method sendPeerSwitchCommand * @@ -266,9 +275,11 @@ class LinkProber * *@brief send ICMP ECHOREQUEST packet * + *@param forceSend (in) Force sending heartbeat, used in link detect only + * *@return none */ - void sendHeartbeat(); + void sendHeartbeat(bool forceSend = false); /** *@method handleTlvCommandRecv diff --git a/test/FakeLinkProber.cpp b/test/FakeLinkProber.cpp index 33523be9..0d05fc83 100644 --- a/test/FakeLinkProber.cpp +++ b/test/FakeLinkProber.cpp @@ -106,6 +106,13 @@ void FakeLinkProber::probePeerTor() mProbePeerTorCallCount++; } +void FakeLinkProber::detectLink() +{ + MUXLOGINFO(""); + + mDetectLinkCallCount++; +} + void FakeLinkProber::suspendTxProbes(uint32_t suspendTime_msec) { MUXLOGINFO(""); diff --git a/test/FakeLinkProber.h b/test/FakeLinkProber.h index 2879d897..fd181421 100644 --- a/test/FakeLinkProber.h +++ b/test/FakeLinkProber.h @@ -43,6 +43,7 @@ class FakeLinkProber void startProbing(); void updateEthernetFrame(); void probePeerTor(); + void detectLink(); void suspendTxProbes(uint32_t suspendTime_msec); void resumeTxProbes(); void sendPeerSwitchCommand(); @@ -62,6 +63,7 @@ class FakeLinkProber uint32_t mUpdateEthernetFrameCallCount = 0; uint32_t mProbePeerTorCallCount = 0; uint32_t mSuspendTxProbeCallCount = 0; + uint32_t mDetectLinkCallCount = 0; uint32_t mResumeTxProbeCallCount = 0; uint32_t mSendPeerSwitchCommand = 0; uint32_t mSendPeerProbeCommand = 0; diff --git a/test/FakeMuxPort.cpp b/test/FakeMuxPort.cpp index a32d2b9b..7757003f 100644 --- a/test/FakeMuxPort.cpp +++ b/test/FakeMuxPort.cpp @@ -117,6 +117,9 @@ inline void FakeMuxPort::initLinkProberActiveStandby() getActiveStandbyStateMachinePtr()->setProbePeerTorFnPtr( boost::bind(&FakeLinkProber::probePeerTor, mFakeLinkProber.get()) ); + getActiveStandbyStateMachinePtr()->setDetectLinkFnPtr( + boost::bind(&FakeLinkProber::detectLink, mFakeLinkProber.get()) + ); getActiveStandbyStateMachinePtr()->setSuspendTxFnPtr( boost::bind(&FakeLinkProber::suspendTxProbes, mFakeLinkProber.get(), boost::placeholders::_1) ); diff --git a/test/LinkManagerStateMachineTest.cpp b/test/LinkManagerStateMachineTest.cpp index 401eb0af..c954c4a8 100644 --- a/test/LinkManagerStateMachineTest.cpp +++ b/test/LinkManagerStateMachineTest.cpp @@ -1494,4 +1494,39 @@ TEST_F(LinkManagerStateMachineTest, OrchagentRollback) EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 2); } +TEST_F(LinkManagerStateMachineTest, ProbeLinkInSuspendTimeout) +{ + setMuxActive(); + VALIDATE_STATE(Active, Active, Up); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 0); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 0); + + postLinkProberEvent(link_prober::LinkProberState::Unknown); + VALIDATE_STATE(Unknown, Active, Up); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 0); + + postSuspendTimerExpiredEvent(2); + VALIDATE_STATE(Unknown, Wait, Up); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1); + + // peer doesn't take the link + handleProbeMuxState("active", 3); + VALIDATE_STATE(Unknown, Active, Up); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 2); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1); + + // link resumes, take back the link and resume heartbeat + postLinkProberEvent(link_prober::LinkProberState::Active); + VALIDATE_STATE(Active, Active, Up); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 2); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 2); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1); +} + } /* namespace test */