Skip to content

Commit 6c0fea8

Browse files
milan-zededaOhmSpectator
authored andcommitted
Avoid endless loop in DPCManager.runVerify when we get RemoteTemporaryFailure
This commit fixes a bug in DpcManager.runVerify that would cause an infinite loop when a RemoteTemporaryFailure was returned by the ConnectivityTester. The issue was that DPCManager didn’t flag the DPC as tested when a RemoteTemporaryFailure was received. Such a DPC was then mistakenly evaluated as a new DPC received during testing, and the entire testing procedure was repeated—resulting in an endless cycle. This commit fixes the issue by flagging the DPC as tested, and actually as successful, reporting the RemoteTemporaryFailure only as a warning, since connectivity itself is working. There is no need to handle testing retries for RemoteTemporaryFailure explicitly, as they are already performed periodically every 5 minutes (timer.port.testinterval), even when the latest DPC is marked as successful. Signed-off-by: Milan Lenco <[email protected]> (cherry picked from commit 07215ab)
1 parent 138416b commit 6c0fea8

File tree

2 files changed

+60
-3
lines changed

2 files changed

+60
-3
lines changed

pkg/pillar/dpcmanager/dpcmanager_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,3 +1823,59 @@ func TestOldDPC(test *testing.T) {
18231823
eth0Dhcpcd := dg.Reference(generic.Dhcpcd{AdapterIfName: "eth0"})
18241824
t.Expect(itemIsCreated(eth0Dhcpcd)).To(BeTrue())
18251825
}
1826+
1827+
func TestRemoteTemporaryFailure(test *testing.T) {
1828+
t := initTest(test)
1829+
1830+
// Prepare simulated network stack.
1831+
eth0 := mockEth0()
1832+
networkMonitor.AddOrUpdateInterface(eth0)
1833+
1834+
// Single interface configured for mgmt.
1835+
aa := makeAA(selectedIntfs{eth0: true})
1836+
dpcManager.UpdateAA(aa)
1837+
1838+
// Apply global config.
1839+
dpcManager.UpdateGCP(globalConfig())
1840+
1841+
// Apply initial "bootstrap" DPC with single ethernet port.
1842+
timePrio1 := time.Now()
1843+
dpc := makeDPC("bootstrap", timePrio1, selectedIntfs{eth0: true})
1844+
dpcManager.AddDPC(dpc)
1845+
t.Eventually(testingInProgressCb()).Should(BeTrue())
1846+
t.Eventually(testingInProgressCb()).Should(BeFalse())
1847+
t.Eventually(dpcIdxCb()).Should(Equal(0))
1848+
t.Eventually(dnsKeyCb()).Should(Equal("bootstrap"))
1849+
t.Eventually(dpcStateCb(0)).Should(Equal(types.DPCStateSuccess))
1850+
1851+
// Apply "zedagent" DPC with single ethernet port.
1852+
timePrio2 := time.Now()
1853+
dpc = makeDPC("zedagent", timePrio2, selectedIntfs{eth0: true})
1854+
1855+
// Simulate certificate error, which is reported from ConnectivityTester to DpcManager
1856+
// as a RemoteTemporaryFailure.
1857+
connTester.SetConnectivityError("zedagent", "eth0",
1858+
&conntester.RemoteTemporaryFailure{
1859+
Endpoint: "simulated-controller",
1860+
WrappedErr: errors.New("certificate error"),
1861+
})
1862+
1863+
// Even though we are getting error from the simulated controller,
1864+
// the connectivity is working and DPC should be marked as working
1865+
// and replace the "bootstrap" DPC.
1866+
// Previously, we had a bug that would result in DPCManager being
1867+
// stuck in a loop inside runVerify, re-running verification
1868+
// for the same DPC and constantly getting RemoteTemporaryFailure.
1869+
dpcManager.AddDPC(dpc)
1870+
t.Eventually(testingInProgressCb()).Should(BeTrue())
1871+
t.Eventually(testingInProgressCb()).Should(BeFalse())
1872+
t.Eventually(dpcListLenCb()).Should(Equal(1)) // "bootstrap" compressed out
1873+
t.Eventually(dpcIdxCb()).Should(Equal(0))
1874+
t.Eventually(dnsKeyCb()).Should(Equal("zedagent"))
1875+
t.Eventually(dpcStateCb(0)).Should(Equal(types.DPCStateRemoteWait))
1876+
dpc = getDPC(0)
1877+
t.Expect(dpc.HasError()).To(BeFalse())
1878+
t.Expect(dpc.HasWarning()).To(BeTrue())
1879+
t.Expect(dpc.LastWarning).To(Equal(
1880+
"Remote temporary failure (endpoint: simulated-controller): certificate error"))
1881+
}

pkg/pillar/dpcmanager/verify.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ func (m *DpcManager) verifyDPC(ctx context.Context) (status types.DPCState) {
251251
switch dpc.State {
252252
case types.DPCStateFail, types.DPCStateFailWithIPAndDNS:
253253
cloudConnWorks = false
254-
case types.DPCStateSuccess:
254+
case types.DPCStateSuccess, types.DPCStateRemoteWait:
255255
cloudConnWorks = true
256256
default:
257257
// DpcManager is waiting for something (IP address, DNS server, etc.)
@@ -277,8 +277,9 @@ func (m *DpcManager) verifyDPC(ctx context.Context) (status types.DPCState) {
277277
_, rtf := err.(*conntester.RemoteTemporaryFailure)
278278
if rtf {
279279
m.Log.Errorf("DPC verify: remoteTemporaryFailure: %v", err)
280-
// NOTE: We retry until the certificate or ECONNREFUSED is fixed
281-
// on the server side.
280+
// We treat RemoteTemporaryFailure as a success because the connectivity is working.
281+
// Save and publish the server-side error only as a warning.
282+
dpc.RecordSuccessWithWarning(err.Error())
282283
status = types.DPCStateRemoteWait
283284
dpc.State = status
284285
return status

0 commit comments

Comments
 (0)