@@ -131,13 +131,13 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
131131 // trigger reboot only for nodes which are in UpgradeStarted but haven't rebooted yet
132132 if nodeObj .Status .NodeInfo .BootID == moduleStatus .BootId {
133133 log .FromContext (ctx ).Info (fmt .Sprintf ("Node: %v: Reboot is required for driver upgrade, triggering node reboot" , nodeName ))
134- n .helper .handleNodeReboot (ctx , nodeObj , deviceConfig )
134+ n .helper .handleNodeReboot (ctx , nodeObj , * deviceConfig )
135135 // for nodes which are in UpgradeStarted but already rebooted. Schedule the reboot pod deletion
136136 } else {
137137 currentBootID := nodeObj .Status .NodeInfo .BootID
138138 n .helper .setBootID (nodeObj .Name , currentBootID )
139139 log .FromContext (ctx ).Info (fmt .Sprintf ("Node: %v: Node already rebooted, scheduling reboot pod deletion" , nodeName ))
140- go n .helper .deleteRebootPod (ctx , nodeName , deviceConfig , false , deviceConfig . Generation )
140+ go n .helper .deleteRebootPod (ctx , nodeName , * deviceConfig , false )
141141 }
142142 }
143143 } else {
@@ -155,7 +155,7 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
155155 n .helper .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
156156 } else {
157157 n .helper .setNodeStatus (ctx , nodeName , moduleStatus .Status )
158- go n .helper .deleteRebootPod (ctx , nodeName , deviceConfig , false , deviceConfig . Generation )
158+ go n .helper .deleteRebootPod (ctx , nodeName , * deviceConfig , false )
159159 }
160160 } else {
161161 n .helper .setNodeStatus (ctx , nodeName , moduleStatus .Status )
@@ -276,7 +276,7 @@ func (n *upgradeMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1
276276 if err := n .helper .cordonOrUncordonNode (ctx , deviceConfig , & nodeList .Items [i ], false ); err != nil {
277277 log .FromContext (ctx ).Error (err , fmt .Sprintf ("Taint Removal failed for %v during deviceconfig delete:%v" , & nodeList .Items [i ].Name , err ))
278278 }
279- n .helper .deleteRebootPod (ctx , nodeList .Items [i ].Name , deviceConfig , true , deviceConfig . Generation )
279+ n .helper .deleteRebootPod (ctx , nodeList .Items [i ].Name , * deviceConfig , true )
280280 }
281281 n .helper .clearNodeStatus ()
282282 return
@@ -322,8 +322,8 @@ type upgradeMgrHelperAPI interface {
322322 getPodsToDrainOrDelete (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) (newPods []v1.Pod , err error )
323323 deleteOrDrainPods (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) error
324324 updateModuleVersionOnNode (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) error
325- handleNodeReboot (ctx context.Context , node * v1.Node , dc * amdv1alpha1.DeviceConfig )
326- deleteRebootPod (ctx context.Context , nodeName string , dc * amdv1alpha1.DeviceConfig , force bool , genId int64 )
325+ handleNodeReboot (ctx context.Context , node * v1.Node , dc amdv1alpha1.DeviceConfig )
326+ deleteRebootPod (ctx context.Context , nodeName string , dc amdv1alpha1.DeviceConfig , force bool )
327327 getRebootPod (nodeName string , dc * amdv1alpha1.DeviceConfig ) * v1.Pod
328328
329329 // getters and setters
@@ -817,7 +817,7 @@ func (h *upgradeMgrHelper) handleNodeUpgrade(ctx context.Context, deviceConfig a
817817
818818 // Reboot the node if required
819819 if deviceConfig .Spec .Driver .UpgradePolicy .RebootRequired != nil && * deviceConfig .Spec .Driver .UpgradePolicy .RebootRequired {
820- h .handleNodeReboot (ctx , & node , & deviceConfig )
820+ h .handleNodeReboot (ctx , & node , deviceConfig )
821821 } else {
822822 // Update expected module version on the node
823823 if err := h .updateModuleVersionOnNode (ctx , & deviceConfig , & node ); err != nil {
@@ -956,9 +956,9 @@ func (h *upgradeMgrHelper) updateModuleVersionOnNode(ctx context.Context, device
956956 return nil
957957}
958958
959- func (h * upgradeMgrHelper ) handleNodeReboot (ctx context.Context , node * v1.Node , dc * amdv1alpha1.DeviceConfig ) {
959+ func (h * upgradeMgrHelper ) handleNodeReboot (ctx context.Context , node * v1.Node , dc amdv1alpha1.DeviceConfig ) {
960960 logger := log .FromContext (ctx )
961- rebootPod := h .getRebootPod (node .Name , dc )
961+ rebootPod := h .getRebootPod (node .Name , & dc )
962962 // Delete the existing pod if present
963963 pod := & v1.Pod {}
964964 if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : rebootPod .Name }, pod ); err == nil {
@@ -970,7 +970,7 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node,
970970 }
971971
972972 // Update expected module version on the node
973- if err := h .updateModuleVersionOnNode (ctx , dc , node ); err != nil {
973+ if err := h .updateModuleVersionOnNode (ctx , & dc , node ); err != nil {
974974 logger .Error (err , fmt .Sprintf ("Node: %v State: %v UpgradeFailed with Error: %v" , node .Name , h .getNodeStatus (node .Name ), err ))
975975 // Mark the state as failed
976976 h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateFailed )
@@ -1037,15 +1037,22 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node,
10371037 // Wait for the rebootPod to get spawned
10381038 waitForRebootPod ()
10391039
1040- h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateRebootInProgress )
1041- h .deleteRebootPod (ctx , node .Name , dc , false , dc .Generation )
1040+ fetchedDeviceConfig := & amdv1alpha1.DeviceConfig {}
1041+ if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : dc .Name }, fetchedDeviceConfig ); err != nil {
1042+ logger .Error (err , "Failed to fetch DeviceConfig from API server" )
1043+ return
1044+ }
1045+ if fetchedDeviceConfig .Spec .Driver .Version == dc .Spec .Driver .Version {
1046+ h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateRebootInProgress )
1047+ }
1048+ h .deleteRebootPod (ctx , node .Name , dc , false )
10421049
10431050}
10441051
1045- func (h * upgradeMgrHelper ) deleteRebootPod (ctx context.Context , nodeName string , dc * amdv1alpha1.DeviceConfig , force bool , genId int64 ) {
1052+ func (h * upgradeMgrHelper ) deleteRebootPod (ctx context.Context , nodeName string , dc amdv1alpha1.DeviceConfig , force bool ) {
10461053
10471054 logger := log .FromContext (ctx )
1048- rebootPod := h .getRebootPod (nodeName , dc )
1055+ rebootPod := h .getRebootPod (nodeName , & dc )
10491056 fetchedDeviceConfig := & amdv1alpha1.DeviceConfig {}
10501057 pod := & v1.Pod {}
10511058 if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : rebootPod .Name }, pod ); err != nil {
@@ -1077,7 +1084,7 @@ func (h *upgradeMgrHelper) deleteRebootPod(ctx context.Context, nodeName string,
10771084 if err := h .client .Delete (ctx , rebootPod ); err != nil {
10781085 logger .Error (err , fmt .Sprintf ("Node: %v State: %v RebootPod Delete failed with Error: %v" , nodeName , h .getNodeStatus (nodeName ), err ))
10791086 }
1080- if fetchedDeviceConfig .Generation == genId {
1087+ if fetchedDeviceConfig .Spec . Driver . Version == dc . Spec . Driver . Version {
10811088 logger .Info ("Setting to In-Progress after deleting reboot pod" )
10821089 h .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
10831090 }
@@ -1100,7 +1107,7 @@ func (h *upgradeMgrHelper) deleteRebootPod(ctx context.Context, nodeName string,
11001107 logger .Error (err , "Failed to fetch DeviceConfig from API server" )
11011108 return
11021109 }
1103- if fetchedDeviceConfig .Generation == genId {
1110+ if fetchedDeviceConfig .Spec . Driver . Version == dc . Spec . Driver . Version {
11041111 logger .Info ("Setting to In-Progress after deleting reboot pod eventually" )
11051112 h .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
11061113 }
0 commit comments