Skip to content

Commit 8e53757

Browse files
committed
Put device in maintenance mode if no edge node certs
After a device has been reinstalled while reusing the device certificate, attempts to publish EdgeNodeCerts will be rejected by the controller for security reasons since the edge node certs have changed. (They are stored in /persist/certs, hence are lost and recreated on a device reinstall.) This commit makes that unusual condition visible to the user by putting the device in maintenance mode. Signed-off-by: eriknordmark <[email protected]>
1 parent c24b01b commit 8e53757

File tree

5 files changed

+50
-8
lines changed

5 files changed

+50
-8
lines changed

pkg/pillar/cmd/nodeagent/nodeagent.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,17 @@ func handleZedAgentStatusImpl(ctxArg interface{}, key string,
502502
handleDeviceCmd(ctxPtr, status, types.DeviceOperationPoweroff)
503503
}
504504
updateZedagentCloudConnectStatus(ctxPtr, status)
505+
if status.EdgeNodeCertsRefused {
506+
addMaintenanceModeReason(ctxPtr,
507+
types.MaintenanceModeReasonEdgeNodeCertsRefused,
508+
"handleZedAgentStatusImpl")
509+
publishNodeAgentStatus(ctxPtr)
510+
} else {
511+
removeMaintenanceModeReason(ctxPtr,
512+
types.MaintenanceModeReasonEdgeNodeCertsRefused,
513+
"handleZedAgentStatusImpl")
514+
publishNodeAgentStatus(ctxPtr)
515+
}
505516
log.Functionf("handleZedAgentStatusImpl(%s) done", key)
506517
}
507518

pkg/pillar/cmd/zedagent/handlecertconfig.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,10 @@ func edgeNodeCertsTask(ctx *zedagentContext, triggerEdgeNodeCerts chan struct{})
454454
}
455455

456456
// prepare the edge node certs list proto message
457+
// Note that this can be rejected by the controller if a cert has changed
458+
// (which can happen if /persist, hence /persist/certs, have been destroyed
459+
// and recreated). In that case we set edgeNodeCertsRefused which will make
460+
// nodeagent put this device in maintenance mode.
457461
func publishEdgeNodeCertsToController(ctx *zedagentContext) {
458462
var attestReq = &attest.ZAttestReq{}
459463

pkg/pillar/cmd/zedagent/handleconfig.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,7 @@ func publishZedAgentStatus(getconfigCtx *getconfigContext) {
11071107
RequestedBootReason: ctx.requestedBootReason,
11081108
MaintenanceMode: ctx.maintenanceMode,
11091109
MaintenanceModeReasons: ctx.maintModeReasons,
1110+
EdgeNodeCertsRefused: ctx.edgeNodeCertsRefused,
11101111
ForceFallbackCounter: ctx.forceFallbackCounter,
11111112
CurrentProfile: getconfigCtx.localCmdAgent.GetCurrentProfile(),
11121113
RadioSilence: getconfigCtx.localCmdAgent.GetRadioSilenceConfig(),

pkg/pillar/cmd/zedagent/zedagent.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ type zedagentContext struct {
230230
apiMaintenanceMode bool
231231
localMaintenanceMode bool //maintenance mode triggered by local failure
232232
localMaintModeReasons types.MaintenanceModeMultiReason //local failure reason for maintenance mode
233+
edgeNodeCertsRefused bool // causes maintenance mode
233234
devState types.DeviceState
234235
attestState types.AttestState
235236
attestError string
@@ -2781,7 +2782,11 @@ func getDeferredSentHandlerFunction(ctx *zedagentContext) controllerconn.SentHan
27812782
}
27822783
if el, ok := itemType.(attest.ZAttestReqType); ok && el == attest.ZAttestReqType_ATTEST_REQ_CERT {
27832784
log.Noticef("sendAttestReqProtobuf: Sent EdgeNodeCerts")
2784-
ctx.publishedEdgeNodeCerts = true
2785+
if !ctx.publishedEdgeNodeCerts {
2786+
ctx.publishedEdgeNodeCerts = true
2787+
ctx.edgeNodeCertsRefused = false
2788+
publishZedAgentStatus(ctx.getconfigCtx)
2789+
}
27852790
}
27862791
} else {
27872792
if el, ok := itemType.(attest.ZAttestReqType); ok {
@@ -2801,9 +2806,26 @@ func getDeferredSentHandlerFunction(ctx *zedagentContext) controllerconn.SentHan
28012806
if el == attest.ZAttestReqType_ATTEST_REQ_CERT {
28022807
log.Warnf("sendAttestReqProtobuf: Failed to send EdgeNodeCerts: %s",
28032808
result.String())
2804-
// XXX should we declare maintenance mode?
28052809
// We get SenderStatusNotFound when a cert can
28062810
// not be replaced in the controller for security reasons.
2811+
if !ctx.publishedEdgeNodeCerts &&
2812+
ctx.getconfigCtx.configGetStatus == types.ConfigGetSuccess {
2813+
// Force maintenance mode
2814+
// Note: If the network is flaky we might set it early
2815+
// when we have ConfigGetSuccess and then clear it once the certs are published
2816+
log.Errorf("Failed to send EdgeNodeCerts: %s, maint %t, %v",
2817+
result.String(),
2818+
ctx.maintenanceMode,
2819+
ctx.maintModeReasons)
2820+
ctx.edgeNodeCertsRefused = true
2821+
publishZedAgentStatus(ctx.getconfigCtx)
2822+
} else if ctx.getconfigCtx.configGetStatus != types.ConfigGetSuccess {
2823+
// Lost controller connectivity
2824+
// Currently no need for maintenance mode - it will
2825+
// be determined when connectivity is back
2826+
ctx.edgeNodeCertsRefused = false
2827+
publishZedAgentStatus(ctx.getconfigCtx)
2828+
}
28072829
}
28082830
if !ctx.publishedEdgeNodeCerts {
28092831
// Attestation request does not clog the send queue (issued

pkg/pillar/types/zedagenttypes.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -402,12 +402,13 @@ type MaintenanceModeMultiReason []MaintenanceModeReason
402402
// MaintenanceModeReason codes for storing reason for getting into maintenance mode,
403403
// this should match the values in api/proto/info/info.proto.MaintenanceModeReason
404404
const (
405-
MaintenanceModeReasonNone = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_NONE)
406-
MaintenanceModeReasonUserRequested = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_USER_REQUESTED)
407-
MaintenanceModeReasonVaultLockedUp = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_VAULT_LOCKED_UP)
408-
MaintenanceModeReasonNoDiskSpace = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_LOW_DISK_SPACE)
409-
MaintenanceModeReasonTpmEncFailure = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_TPM_ENCRYPTION_FAILURE)
410-
MaintenanceModeReasonTpmQuoteFailure = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_TPM_QUOTE_FAILURE)
405+
MaintenanceModeReasonNone = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_NONE)
406+
MaintenanceModeReasonUserRequested = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_USER_REQUESTED)
407+
MaintenanceModeReasonVaultLockedUp = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_VAULT_LOCKED_UP)
408+
MaintenanceModeReasonNoDiskSpace = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_LOW_DISK_SPACE)
409+
MaintenanceModeReasonTpmEncFailure = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_TPM_ENCRYPTION_FAILURE)
410+
MaintenanceModeReasonTpmQuoteFailure = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_TPM_QUOTE_FAILURE)
411+
MaintenanceModeReasonEdgeNodeCertsRefused = MaintenanceModeReason(info.MaintenanceModeReason_MAINTENANCE_MODE_REASON_EDGE_NODE_CERTS_REFUSED)
411412
)
412413

413414
// String returns the verbose equivalent of MaintenanceModeMultiReason code
@@ -441,6 +442,8 @@ func (mmr MaintenanceModeReason) String() string {
441442
return "MaintenanceModeReasonNoDiskSpace"
442443
case MaintenanceModeReasonTpmEncFailure:
443444
return "MaintenanceModeReasonTpmEncFailure"
445+
case MaintenanceModeReasonEdgeNodeCertsRefused:
446+
return "MaintenanceModeReasonEdgeNodeCertsRefused"
444447
default:
445448
return "Unknown MaintenanceModeReason"
446449
}
@@ -559,6 +562,7 @@ type ZedAgentStatus struct {
559562
RequestedBootReason BootReason // Why we will reboot
560563
MaintenanceMode bool // Don't run apps etc
561564
MaintenanceModeReasons MaintenanceModeMultiReason
565+
EdgeNodeCertsRefused bool // Causes maintenance mode
562566
ForceFallbackCounter int // Try image fallback when counter changes
563567
CurrentProfile string // Current profile
564568
RadioSilence RadioSilence // Currently requested state of radio devices

0 commit comments

Comments
 (0)