Skip to content

Commit

Permalink
[active-standby] Probe the link in suspend timeout (sonic-net#235)
Browse files Browse the repository at this point in the history
Approach
What is the motivation for this PR?
Fix the issue:
After the link probe unknown, the standby ToR fails to take the link.
When the link recovers, the active ToR is stuck in heartbeat suspension.

Work item tracking
Microsoft ADO (number only): 26281338
How did you do it?
Let's probe the link in suspension timeout if current state is (unknown, active, up).
Kudos to @zjswhhh to point out we should force sending the heartbeat in link detect.

How did you verify/test it?
UT and verify on testbed.

Any platform specific information?
  • Loading branch information
lolyu authored Jan 17, 2024
1 parent 6586abf commit 74c33ea
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 6 deletions.
6 changes: 6 additions & 0 deletions src/link_manager/LinkManagerStateMachineActiveStandby.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,9 @@ void ActiveStandbyStateMachine::handleSwssBladeIpv4AddressUpdate(boost::asio::ip
mProbePeerTorFnPtr = boost::bind(
&link_prober::LinkProber::probePeerTor, mLinkProberPtr.get()
);
mDetectLinkFnPtr = boost::bind(
&link_prober::LinkProber::detectLink, mLinkProberPtr.get()
);
mSuspendTxFnPtr = boost::bind(
&link_prober::LinkProber::suspendTxProbes, mLinkProberPtr.get(), boost::placeholders::_1
);
Expand Down Expand Up @@ -490,6 +493,7 @@ void ActiveStandbyStateMachine::handleStateChange(LinkProberEvent &event, link_p

if (ps(mCompositeState) != link_prober::LinkProberState::Unknown) {
mResumeTxFnPtr();
mUnknownActiveUpBackoffFactor = 1;
}

updateMuxLinkmgrState();
Expand Down Expand Up @@ -829,6 +833,8 @@ void ActiveStandbyStateMachine::handleSuspendTimerExpiry()
CompositeState currState = mCompositeState;
enterMuxWaitState(mCompositeState);
LOGINFO_MUX_STATE_TRANSITION(mMuxPortConfig.getPortName(), currState, mCompositeState);
// detect if link/server recovers
mDetectLinkFnPtr();
} else {
mUnknownActiveUpBackoffFactor = 1;
}
Expand Down
12 changes: 12 additions & 0 deletions src/link_manager/LinkManagerStateMachineActiveStandby.h
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,17 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase,
*/
void setProbePeerTorFnPtr(boost::function<void ()> probePeerTorFnPtr) {mProbePeerTorFnPtr = probePeerTorFnPtr;};

/**
*@method setDetectLinkFnPtr
*
*@brief set new DetectLinkFnPtr for the state machine. This method is used for testing
*
*@param detectLinkFnPtr (in) pointer to new DetectLinkFnPtr
*
*@return none
*/
void setDetectLinkFnPtr(boost::function<void ()> detectLinkFnPtr) {mDetectLinkFnPtr = detectLinkFnPtr;};

/**
*@method setSuspendTxFnPtr
*
Expand Down Expand Up @@ -852,6 +863,7 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase,
boost::function<void ()> mInitializeProberFnPtr;
boost::function<void ()> mStartProbingFnPtr;
boost::function<void ()> mProbePeerTorFnPtr;
boost::function<void ()> mDetectLinkFnPtr;
boost::function<void (uint32_t suspendTime_msec)> mSuspendTxFnPtr;
boost::function<void ()> mResumeTxFnPtr;
boost::function<void ()> mSendPeerSwitchCommandFnPtr;
Expand Down
23 changes: 18 additions & 5 deletions src/link_prober/LinkProber.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,21 @@ void LinkProber::updateEthernetFrame()
void LinkProber::probePeerTor()
{
boost::asio::io_service &ioService = mStrand.context();
ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this)));
ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this, false)));
}

//
// ---> detectLink();
//
// send HBs to detect the link status
//
void LinkProber::detectLink()
{
boost::asio::io_service &ioService = mStrand.context();
for (uint32_t i = 0; i < mMuxPortConfig.getPositiveStateChangeRetryCount(); ++i)
{
ioService.post(mStrand.wrap(boost::bind(&LinkProber::sendHeartbeat, this, true)));
}
}

//
Expand Down Expand Up @@ -304,18 +318,17 @@ void LinkProber::handleSendProbeCommand()
}

//
// ---> sendHeartbeat()
// ---> sendHeartbeat(bool forceSend)
//
// send ICMP ECHOREQUEST packet
//
void LinkProber::sendHeartbeat()
void LinkProber::sendHeartbeat(bool forceSend)
{
MUXLOGTRACE(mMuxPortConfig.getPortName());

updateIcmpSequenceNo();

// check if suspend timer is running
if ((!mSuspendTx) && (!mShutdownTx)) {
if (forceSend || ((!mSuspendTx) && (!mShutdownTx))) {
boost::system::error_code errorCode;
mStream.write_some(boost::asio::buffer(mTxBuffer.data(), mTxPacketSize), errorCode);

Expand Down
13 changes: 12 additions & 1 deletion src/link_prober/LinkProber.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,15 @@ class LinkProber
*/
void probePeerTor();

/**
*@method detectLink
*
*@brief detect link status
*
*@return none
*/
void detectLink();

/**
*@method sendPeerSwitchCommand
*
Expand Down Expand Up @@ -266,9 +275,11 @@ class LinkProber
*
*@brief send ICMP ECHOREQUEST packet
*
*@param forceSend (in) Force sending heartbeat, used in link detect only
*
*@return none
*/
void sendHeartbeat();
void sendHeartbeat(bool forceSend = false);

/**
*@method handleTlvCommandRecv
Expand Down
7 changes: 7 additions & 0 deletions test/FakeLinkProber.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ void FakeLinkProber::probePeerTor()
mProbePeerTorCallCount++;
}

void FakeLinkProber::detectLink()
{
MUXLOGINFO("");

mDetectLinkCallCount++;
}

void FakeLinkProber::suspendTxProbes(uint32_t suspendTime_msec)
{
MUXLOGINFO("");
Expand Down
2 changes: 2 additions & 0 deletions test/FakeLinkProber.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class FakeLinkProber
void startProbing();
void updateEthernetFrame();
void probePeerTor();
void detectLink();
void suspendTxProbes(uint32_t suspendTime_msec);
void resumeTxProbes();
void sendPeerSwitchCommand();
Expand All @@ -62,6 +63,7 @@ class FakeLinkProber
uint32_t mUpdateEthernetFrameCallCount = 0;
uint32_t mProbePeerTorCallCount = 0;
uint32_t mSuspendTxProbeCallCount = 0;
uint32_t mDetectLinkCallCount = 0;
uint32_t mResumeTxProbeCallCount = 0;
uint32_t mSendPeerSwitchCommand = 0;
uint32_t mSendPeerProbeCommand = 0;
Expand Down
3 changes: 3 additions & 0 deletions test/FakeMuxPort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ inline void FakeMuxPort::initLinkProberActiveStandby()
getActiveStandbyStateMachinePtr()->setProbePeerTorFnPtr(
boost::bind(&FakeLinkProber::probePeerTor, mFakeLinkProber.get())
);
getActiveStandbyStateMachinePtr()->setDetectLinkFnPtr(
boost::bind(&FakeLinkProber::detectLink, mFakeLinkProber.get())
);
getActiveStandbyStateMachinePtr()->setSuspendTxFnPtr(
boost::bind(&FakeLinkProber::suspendTxProbes, mFakeLinkProber.get(), boost::placeholders::_1)
);
Expand Down
35 changes: 35 additions & 0 deletions test/LinkManagerStateMachineTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1494,4 +1494,39 @@ TEST_F(LinkManagerStateMachineTest, OrchagentRollback)
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 2);
}

TEST_F(LinkManagerStateMachineTest, ProbeLinkInSuspendTimeout)
{
setMuxActive();
VALIDATE_STATE(Active, Active, Up);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 0);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 0);

postLinkProberEvent(link_prober::LinkProberState::Unknown);
VALIDATE_STATE(Unknown, Active, Up);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 0);

postSuspendTimerExpiredEvent(2);
VALIDATE_STATE(Unknown, Wait, Up);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1);

// peer doesn't take the link
handleProbeMuxState("active", 3);
VALIDATE_STATE(Unknown, Active, Up);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 2);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1);

// link resumes, take back the link and resume heartbeat
postLinkProberEvent(link_prober::LinkProberState::Active);
VALIDATE_STATE(Active, Active, Up);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mSuspendTxProbeCallCount, 2);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mResumeTxProbeCallCount, 2);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1);
}

} /* namespace test */

0 comments on commit 74c33ea

Please sign in to comment.