Skip to content

Commit 9edf670

Browse files
authored
Don't update machine status to terminating on deletion failure (#995)
1 parent f46b017 commit 9edf670

File tree

5 files changed

+60
-4
lines changed

5 files changed

+60
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ See [here](docs/documents/apis.md) for CRD API Documentation
100100
<td>There is a Safety Controller responsible for handling the unidentified or unknown behaviours from the cloud providers. Safety Controller:
101101
<ul>
102102
<li>
103-
freezes the MachineDeployment controller and MachineSet controller if the number of <code>Machine</code> objects goes beyond a certain threshold on top of <code>Spec.replicas</code>. It can be configured by the flag <code>--safety-up</code> or <code>--safety-down</code> and also <code>--machine-safety-overshooting-period`</code>.
103+
freezes the MachineDeployment controller and MachineSet controller if the number of <code>Machine</code> objects goes beyond a certain threshold on top of <code>Spec.replicas</code>. It can be configured by the flag <code>--safety-up</code> or <code>--safety-down</code> and also <code>--machine-safety-overshooting-period</code>.
104104
</li>
105105
<li>
106106
freezes the functionality of the MCM if either of the <code>target-apiserver</code> or the <code>control-apiserver</code> is not reachable.

pkg/controller/controller_suite_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,7 @@ func createController(
465465

466466
controller.machineControl = FakeMachineControl{
467467
controlMachineClient: fakeTypedMachineClient,
468+
Recorder: controller.recorder,
468469
}
469470

470471
return controller, fakeObjectTrackers

pkg/controller/machineset.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ func (c *controller) reconcileClusterMachineSet(key string) error {
528528
return err
529529
}
530530
}
531-
klog.V(3).Infof("Processing the machineset %q with replicas %d associated with machine class: %q", machineSet.Name, machineSet.Spec.Replicas, machineSet.Spec.MachineClass.Name)
531+
klog.V(3).Infof("Processing the machineset %q with replicas %d associated with machine class: %q", machineSet.Name, machineSet.Spec.Replicas, machineSet.Spec.Template.Spec.Class.Name)
532532

533533
selector, err := metav1.LabelSelectorAsSelector(machineSet.Spec.Selector)
534534
if err != nil {
@@ -713,6 +713,7 @@ func (c *controller) prepareMachineForDeletion(ctx context.Context, targetMachin
713713
klog.V(2).Infof("Failed to delete %v, decrementing expectations for %v %s/%s", machineKey, machineSet.Kind, machineSet.Namespace, machineSet.Name)
714714
c.expectations.DeletionObserved(machineSetKey, machineKey)
715715
errCh <- err
716+
return
716717
} else {
717718
// successful delete of a Failed phase machine due to unhealthiness for too long, increments staleMachinesRemoved counter
718719
// note: call is blocking and thread safe as other worker threads might be updating the counter as well
@@ -733,7 +734,7 @@ func (c *controller) prepareMachineForDeletion(ctx context.Context, targetMachin
733734
TimeoutActive: false,
734735
LastUpdateTime: metav1.Now(),
735736
}
736-
if _, err := c.updateMachineStatus(ctx, targetMachine, lastOperation, currentStatus); err != nil {
737+
if _, err := c.updateMachineStatus(ctx, targetMachine, lastOperation, currentStatus); err != nil && !apierrors.IsNotFound(err) {
737738
// TODO: proper error handling needs to happen here
738739
klog.Errorf("failed to update machine status for machine %s: %v", targetMachine.Name, err)
739740
}

pkg/controller/machineset_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package controller
77
import (
88
"context"
99
"errors"
10+
"fmt"
1011
"sync"
1112
"time"
1213

@@ -15,9 +16,11 @@ import (
1516
k8sError "k8s.io/apimachinery/pkg/api/errors"
1617
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1718
"k8s.io/apimachinery/pkg/runtime"
19+
"k8s.io/client-go/testing"
1820
"k8s.io/utils/pointer"
1921

2022
machinev1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
23+
faketyped "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake"
2124
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
2225
)
2326

@@ -1497,6 +1500,7 @@ var _ = Describe("machineset", func() {
14971500
testMachineSet *machinev1.MachineSet
14981501
testFailedMachine2 *machinev1.Machine
14991502
testFailedMachine1 *machinev1.Machine
1503+
testRunningMachine *machinev1.Machine
15001504
)
15011505

15021506
BeforeEach(func() {
@@ -1558,6 +1562,22 @@ var _ = Describe("machineset", func() {
15581562
},
15591563
},
15601564
}
1565+
1566+
testRunningMachine = &machinev1.Machine{
1567+
ObjectMeta: metav1.ObjectMeta{
1568+
Name: "machine-t",
1569+
Namespace: testNamespace,
1570+
UID: "1234560",
1571+
Labels: map[string]string{
1572+
"test-label": "test-label",
1573+
},
1574+
},
1575+
Status: machinev1.MachineStatus{
1576+
CurrentStatus: machinev1.CurrentStatus{
1577+
Phase: MachineRunning,
1578+
},
1579+
},
1580+
}
15611581
})
15621582

15631583
// Testcase: It should delete the inactive machines.
@@ -1582,6 +1602,40 @@ var _ = Describe("machineset", func() {
15821602
Expect(Err1).Should(Not(BeNil()))
15831603
Expect(Err2).Should(Not(BeNil()))
15841604
})
1605+
1606+
It("It should not mark a machine as terminating when deletion fails.", func() {
1607+
stop := make(chan struct{})
1608+
defer close(stop)
1609+
1610+
objects := []runtime.Object{}
1611+
objects = append(objects, testMachineSet, testFailedMachine1, testRunningMachine)
1612+
c, trackers := createController(stop, testNamespace, objects, nil, nil)
1613+
defer trackers.Stop()
1614+
waitForCacheSync(stop, c)
1615+
1616+
// Ref: https://estebangarcia.io/unit-testing-k8s-golang/
1617+
// Add a reactor that intercepts machine delete call and returns an error
1618+
// to simulate error when processing deletion request for a machine
1619+
machineDeletionError := "forced machine deletion error"
1620+
c.controlMachineClient.(*faketyped.FakeMachineV1alpha1).Fake.PrependReactor("delete", "machines", func(_ testing.Action) (handled bool, ret runtime.Object, err error) {
1621+
return true, &machinev1.Machine{}, errors.New(machineDeletionError)
1622+
})
1623+
targetMachines := []*machinev1.Machine{testFailedMachine1, testRunningMachine}
1624+
err := c.terminateMachines(context.TODO(), targetMachines, testMachineSet)
1625+
1626+
waitForCacheSync(stop, c)
1627+
1628+
mFailed, Err1 := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), targetMachines[0].Name, metav1.GetOptions{})
1629+
mRunning, Err2 := c.controlMachineClient.Machines(testNamespace).Get(context.TODO(), targetMachines[1].Name, metav1.GetOptions{})
1630+
1631+
Expect(err).To(Equal(fmt.Errorf("unable to delete machines: %s", machineDeletionError)))
1632+
Expect(Err1).Should(BeNil())
1633+
Expect(mFailed.ObjectMeta.DeletionTimestamp).Should(BeNil())
1634+
Expect(mFailed.Status.CurrentStatus.Phase).NotTo(Equal(machinev1.MachineTerminating))
1635+
Expect(Err2).Should(BeNil())
1636+
Expect(mRunning.ObjectMeta.DeletionTimestamp).Should(BeNil())
1637+
Expect(mRunning.Status.CurrentStatus.Phase).NotTo(Equal(machinev1.MachineTerminating))
1638+
})
15851639
})
15861640

15871641
Describe("#addMachineSetFinalizers", func() {

pkg/util/provider/machinecontroller/machine.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
539539
nodeName = createMachineResponse.NodeName
540540
providerID = createMachineResponse.ProviderID
541541
// Creation was successful
542-
klog.V(2).Infof("Created new VM for machine: %q with ProviderID: %q and backing node: %q", machine.Name, providerID, getNodeName(machine))
542+
klog.V(2).Infof("Created new VM for machine: %q with ProviderID: %q and backing node: %q", machine.Name, providerID, nodeName)
543543
// If a node obj already exists by the same nodeName, treat it as a stale node and trigger machine deletion.
544544
// TODO: there is a case with Azure where the VM may join the cluster before the CreateMachine call is completed,
545545
// and that would make an otherwise healthy node, be marked as stale.

0 commit comments

Comments
 (0)