Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
YZ775 committed Feb 3, 2025
1 parent 695977b commit 0cd28be
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 20 deletions.
4 changes: 4 additions & 0 deletions etc/cke-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ repair:
watch_seconds: 600
- repair_command: ["sh", "neco nonGracefulNodeShutdown shutdown $1", "nonGracefulNodeShutdown"]
command_timeout_seconds: 300
command_retries: 3
command_interval: 30
need_drain: true
- repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
command_timeout_seconds: 60 # "discharge" is a compound command and requires a little more time
Expand All @@ -64,6 +66,8 @@ repair:
repair_steps:
- repair_command: ["sh", "neco nonGracefulNodeShutdown shutdown $1", "nonGracefulNodeShutdown"]
command_timeout_seconds: 300
command_retries: 3
command_interval: 30
need_drain: false
- repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
command_timeout_seconds: 60 # "discharge" is a compound command and requires a little more time
Expand Down
16 changes: 8 additions & 8 deletions pkg/neco/cmd/non_graceful_shutdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ var (
)

var nonGracefulNodeShutdownCmd = &cobra.Command{
Use: "nonGracefulNodeShutdown",
Short: "nonGracefulNodeShutdown related commands",
Long: `nonGracefulNodeShutdown related commands.`,
Use: "non-graceful-node-shutdown",
Short: "non-Graceful Node Shutdown related commands",
Long: `non-Graceful Node Shutdown related commands.`,
}

type CephCluster struct {
Expand Down Expand Up @@ -63,12 +63,12 @@ func issueAndLoadKubeconfigForNonGracefulNodeShutdown() (client.Client, error) {
return kubeClient, nil
}

func listRBDCephClusters(ctx context.Context, kubeClient client.Client) (error, []CephCluster) {
func listRBDCephClusters(ctx context.Context, kubeClient client.Client) ([]CephCluster, error) {
cephClusters := []CephCluster{}
scs := &storagev1.StorageClassList{}
err := kubeClient.List(ctx, scs)
if err != nil {
return err, nil
return nil, err
}
cephClusterIDs := []string{}
for _, sc := range scs.Items {
Expand All @@ -82,14 +82,14 @@ func listRBDCephClusters(ctx context.Context, kubeClient client.Client) (error,
cephCluster.SetKind("CephCluster")
err = kubeClient.List(ctx, cephCluster, &client.ListOptions{Namespace: cephClusterID})
if err != nil {
return err, nil
return nil, err
}
if len(cephCluster.Items) != 1 {
return errors.New("cephCluster is not found or multiple cephClusters are found"), nil
return nil, errors.New("cephCluster is not found or multiple cephClusters are found")
}
cephClusters = append(cephClusters, CephCluster{Name: cephCluster.Items[0].GetName(), NameSpace: cephCluster.Items[0].GetNamespace()})
}
return nil, cephClusters
return cephClusters, nil
}

func generateFenceName(clusterName, node string) string {
Expand Down
14 changes: 7 additions & 7 deletions pkg/neco/cmd/non_graceful_shutdown_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
sabakanStatus := machines[0].Status.State

// remove networkfence
err, cephClusters := listRBDCephClusters(ctx, kubeClient)
cephClusters, err := listRBDCephClusters(ctx, kubeClient)
if err != nil {
return err
}
Expand All @@ -65,26 +65,26 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
return err
}
}
// set fence state to Unfenced
networkFence.Spec.FenceState = csiaddonsv1alpha1.Unfenced
networkFence.Status = csiaddonsv1alpha1.NetworkFenceStatus{}
err = kubeClient.Update(ctx, networkFence)
if err != nil {
return err
}
fmt.Printf("Unfence NetworkFence %s\n", networkFence.Name)

// wait for unfense of networkfence to be Succeeded
// wait for unfence of networkfence to be Succeeded
fmt.Printf("Waiting for Unfence operation to be Succeeded\n")
for {
select {
case <-ctx.Done():
return errors.New("Cancelled waiting for Unfence to be Succeeded")
return errors.New("cancelled waiting for Unfence to be Succeeded")
default:
}
err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, networkFence)
if err != nil {
return err
}
if networkFence.Status.Result == csiaddonsv1alpha1.FencingOperationResultSucceeded {
if networkFence.Status.Result == csiaddonsv1alpha1.FencingOperationResultSucceeded && networkFence.Status.Message == csiaddonsv1alpha1.UnFenceOperationSuccessfulMessage {
break
}
time.Sleep(5 * time.Second)
Expand All @@ -93,7 +93,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
if err != nil {
return err
}
fmt.Printf("NetworkFence %s removed\n", networkFence.Name)
fmt.Printf("Unfence operation for NetworkFence %s is succeeded and it is removed\n", networkFence.Name)
return nil
})
}
Expand Down
13 changes: 8 additions & 5 deletions pkg/neco/cmd/non_graceful_shutdown_shutdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
Use: "shutdown IP_ADDRESS",
Short: "non-graceful shutdown the node",
Long: `power off the node and create NetworkFence and then add taint to the node`,
Long: `Power off the node and create NetworkFence and then add taint to the node`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
node := args[0]
Expand Down Expand Up @@ -58,11 +58,11 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
return err
}
// wait for the node to be powered off
fmt.Printf("Waiting for the node %s to be down\n", node)
fmt.Printf("Waiting for the node %s to be power off\n", node)
for {
select {
case <-ctx.Done():
return errors.New("Canncelled waiting for the node to be down")
return errors.New("cancelled waiting for the node to be power off")
default:
}
out.Reset()
Expand All @@ -78,10 +78,11 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
time.Sleep(5 * time.Second)
}
}
fmt.Printf("Node %s is power off\n", node)

// Create NetworkFence for ceph clusters
g := errgroup.Group{}
err, cephClusters := listRBDCephClusters(ctx, kubeClient)
cephClusters, err := listRBDCephClusters(ctx, kubeClient)
if err != nil {
return err
}
Expand Down Expand Up @@ -117,11 +118,12 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
}
fmt.Printf("NetworkFence %s created\n", networkFence.Name)
// wait for the Fence operation of the NetworkFence to succeed
fmt.Println("Waiting for the Fence operation to be succeeded")
networkFence = csiaddonsv1alpha1.NetworkFence{}
for {
select {
case <-ctx.Done():
return errors.New("Cancelled waiting for Fence to be Succeeded")
return errors.New("cancelled waiting for Fence to be succeeded")
default:
}
err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, &networkFence)
Expand All @@ -133,6 +135,7 @@ var nonGracefulNodeShutdownShutdownCmd = &cobra.Command{
}
time.Sleep(5 * time.Second)
}
fmt.Printf("Fence operation for NetworkFence %s is succeeded\n", networkFence.Name)
return nil
})
}
Expand Down

0 comments on commit 0cd28be

Please sign in to comment.