Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
YZ775 committed Feb 13, 2025
1 parent e19fa46 commit 93c94be
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 55 deletions.
9 changes: 4 additions & 5 deletions etc/cke-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ repair:
command_retries: 2
command_interval: 30
watch_seconds: 600
- repair_command: ["sh", "neco nonGracefulNodeShutdown shutdown $1", "nonGracefulNodeShutdown"]
# "-c" is required: without it sh treats the next argument as a script file path instead of a command string.
- repair_command: ["sh", "-c", "neco non-graceful-node-shutdown shutdown $1", "non-graceful-node-shutdown"]
command_timeout_seconds: 300
command_retries: 3
command_interval: 30
Expand All @@ -60,15 +60,14 @@ repair:
watch_seconds: 1500
health_check_command: ["sh", "-c", "check-machine-state $1 healthy retired", "check-machine-state"]
command_timeout_seconds: 10
success_command: ["sh", "-c", "neco nonGracefulNodeShutdown cleanup $1", "nonGracefulNodeShutdown"]
success_command: ["sh", "-c", "neco non-graceful-node-shutdown cleanup $1", "non-graceful-node-shutdown"]
success_timeout_seconds: 300
- operation: unreachable
repair_steps:
- repair_command: ["sh", "neco nonGracefulNodeShutdown shutdown $1", "nonGracefulNodeShutdown"]
# "-c" is required: without it sh treats the next argument as a script file path instead of a command string.
- repair_command: ["sh", "-c", "neco non-graceful-node-shutdown shutdown $1", "non-graceful-node-shutdown"]
command_timeout_seconds: 300
command_retries: 3
command_interval: 30
need_drain: false
- repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
command_timeout_seconds: 60 # "discharge" is a compound command and requires a little more time
command_retries: 2
Expand All @@ -87,7 +86,7 @@ repair:
watch_seconds: 600
health_check_command: ["sh", "-c", "check-machine-state $1 healthy retired", "check-machine-state"]
command_timeout_seconds: 10
success_command: ["sh", "-c", "neco nonGracefulNodeShutdown cleanup $1", "nonGracefulNodeShutdown"]
success_command: ["sh", "-c", "neco non-graceful-node-shutdown cleanup $1", "non-graceful-node-shutdown"]
success_timeout_seconds: 300
max_concurrent_repairs: 1
evict_retries: 30
Expand Down
1 change: 1 addition & 0 deletions pkg/neco/cmd/non_graceful_shutdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (

// Package-level values shared by the non-graceful-node-shutdown subcommands.
var (
	// nonGracefulNodeShutdownConfig is a file path under /tmp.
	// NOTE(review): presumably the location of the kubeconfig used by these
	// subcommands — confirm against issueAndLoadKubeconfigForNonGracefulNodeShutdown.
	nonGracefulNodeShutdownConfig = "/tmp/non-graceful-node-shutdown-config"

	// outOfServiceTaintKey is the key of the node taint that the shutdown
	// subcommand adds to the target node and the cleanup subcommand removes.
	outOfServiceTaintKey = "node.kubernetes.io/out-of-service"
)

var nonGracefulNodeShutdownCmd = &cobra.Command{
Expand Down
22 changes: 3 additions & 19 deletions pkg/neco/cmd/non_graceful_shutdown_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@ package cmd

import (
"context"
"errors"
"fmt"
"os"
"os/signal"
"slices"
"syscall"
"time"

csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
Expand All @@ -26,15 +22,13 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
RunE: func(cmd *cobra.Command, args []string) error {
node := args[0]

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()
ctx := context.Background()

kubeClient, err := issueAndLoadKubeconfigForNonGracefulNodeShutdown()
if err != nil {
return err
}

//get sabakan status
opt := sabakanMachinesGetOpts{}
opt.params = map[string]*string{
"ipv4": &node,
Expand All @@ -45,7 +39,6 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
}
sabakanStatus := machines[0].Status.State

// remove networkfence
cephClusters, err := listRBDCephClusters(ctx, kubeClient)
if err != nil {
return err
Expand All @@ -65,22 +58,13 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
return err
}
}
// set fence state to Unfenced
networkFence.Spec.FenceState = csiaddonsv1alpha1.Unfenced
err = kubeClient.Update(ctx, networkFence)
if err != nil {
return err
}
fmt.Printf("Unfenced NetworkFence %s\n", networkFence.Name)

// wait for unfence of networkfence to be Succeeded
fmt.Printf("Waiting for Unfence operation to be Succeeded\n")
fmt.Printf("Waiting for Unfence operation of %s to be Succeeded\n", networkFence.Name)
for {
select {
case <-ctx.Done():
return errors.New("cancelled waiting for Unfence to be Succeeded")
default:
}
err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, networkFence)
if err != nil {
return err
Expand Down Expand Up @@ -110,7 +94,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
return err
}
for i, taint := range kubernetesNode.Spec.Taints {
if taint.Key == "node.kubernetes.io/out-of-service" {
if taint.Key == outOfServiceTaintKey {
kubernetesNode.Spec.Taints = slices.Delete(kubernetesNode.Spec.Taints, i, i+1)
}
}
Expand Down
60 changes: 29 additions & 31 deletions pkg/neco/cmd/non_graceful_shutdown_shutdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,16 @@ package cmd
import (
"bytes"
"context"
"errors"
"fmt"
"os"
"os/exec"
"os/signal"
"strings"
"syscall"
"time"

csiaddonsv1alpha1 "github.com/csi-addons/kubernetes-csi-addons/api/csiaddons/v1alpha1"
"github.com/spf13/cobra"
"golang.org/x/sync/errgroup"
corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)
Expand All @@ -28,21 +25,12 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
RunE: func(cmd *cobra.Command, args []string) error {
node := args[0]

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()

ctx := context.Background()
kubeClient, err := issueAndLoadKubeconfigForNonGracefulNodeShutdown()
if err != nil {
return err
}

kubernetesNode := &corev1.Node{}
err = kubeClient.Get(ctx, client.ObjectKey{Name: node}, kubernetesNode)
if err != nil {
return err
}

// Shutdown the node
fmt.Printf("Shutting down the node: %s\n", node)
powerCheckCmd := exec.Command("neco", "power", "status", node)
var out bytes.Buffer
Expand All @@ -57,14 +45,8 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
if err != nil {
return err
}
//wait for the node to be down
fmt.Printf("Waiting for the node %s to be power off\n", node)
for {
select {
case <-ctx.Done():
return errors.New("cancelled waiting for the node to be power off")
default:
}
out.Reset()
powerCheckCmd := exec.Command("neco", "power", "status", node)
powerCheckCmd.Stdout = &out
Expand All @@ -80,7 +62,6 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
}
fmt.Printf("Node %s is power off\n", node)

// Create NetworkFence for ceph clusters
g := errgroup.Group{}
cephClusters, err := listRBDCephClusters(ctx, kubeClient)
if err != nil {
Expand Down Expand Up @@ -116,16 +97,9 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
return err
}
}
fmt.Printf("NetworkFence %s created\n", networkFence.Name)
// wait for fence of networkfence to be Succeeded
fmt.Println("Waiting for the fence operation to be succeeded")
fmt.Printf("Waiting for the fence operation of %s to be succeeded\n", networkFence.Name)
networkFence = csiaddonsv1alpha1.NetworkFence{}
for {
select {
case <-ctx.Done():
return errors.New("cancelled waiting for fence operation to be succeeded")
default:
}
err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, &networkFence)
if err != nil {
return err
Expand All @@ -146,16 +120,21 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{

// Add taint to the node
fmt.Println("Adding taint to the node")
kubernetesNode := &corev1.Node{}
err = kubeClient.Get(ctx, client.ObjectKey{Name: node}, kubernetesNode)
if err != nil {
return err
}
tainted := false
for _, taint := range kubernetesNode.Spec.Taints {
if taint.Key == "node.kubernetes.io/out-of-service" {
if taint.Key == outOfServiceTaintKey {
tainted = true
break
}
}
if !tainted {
kubernetesNode.Spec.Taints = append(kubernetesNode.Spec.Taints, corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Key: outOfServiceTaintKey,
Value: "nodeshutdown",
Effect: "NoExecute",
})
Expand All @@ -164,6 +143,25 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
return err
}
}

fmt.Println("Waiting for the VolumeAttachment to be deleted")
for {
volumeAttachmentList := &storagev1.VolumeAttachmentList{}
err = kubeClient.List(ctx, volumeAttachmentList)
if err != nil {
return err
}
volumeAttachmenCount := 0
for _, volumeAttachment := range volumeAttachmentList.Items {
if volumeAttachment.Spec.NodeName == node {
volumeAttachmenCount++
}
}
if volumeAttachmenCount == 0 {
break
}
time.Sleep(5 * time.Second)
}
fmt.Println("Non-Graceful Node Shutdown completed")
return nil
},
Expand Down

0 comments on commit 93c94be

Please sign in to comment.