-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathmaintenance_mode_checker.go
105 lines (89 loc) · 4.08 KB
/
maintenance_mode_checker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
* maintenance_mode_checker.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controllers
import (
"context"
"fmt"
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/FoundationDB/fdb-kubernetes-operator/internal/maintenance"
"github.com/go-logr/logr"
"time"
)
// maintenanceModeChecker provides a reconciliation step for clearing the maintenance mode if all the processes in the
// current maintenance zone have been restarted. The type carries no fields; all of the work happens in its reconcile
// method, which additionally removes finished and stale entries from the list of processes under maintenance.
type maintenanceModeChecker struct{}
// reconcile prunes the list of processes under maintenance and switches the maintenance mode off once no process of
// the active maintenance zone is still waiting to be updated.
//
// The step is skipped (nil is returned) when cluster.ResetMaintenanceMode() reports false. If status is nil, the
// status is fetched through the admin client. The returned requeue is nil when nothing more has to be done;
// otherwise it carries either the error of the call that failed or a message describing what the step is waiting for.
func (c maintenanceModeChecker) reconcile(_ context.Context, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, logger logr.Logger) *requeue {
	// Delay attached to the requeues that only wait for the cluster to make progress.
	const retryDelay = 5 * time.Second

	if !cluster.ResetMaintenanceMode() {
		return nil
	}

	adminClient, err := r.getAdminClient(logger, cluster)
	if err != nil {
		return &requeue{curError: err, delayedRequeue: true}
	}

	// No cached status was handed in, so ask the cluster for the current one.
	if status == nil {
		if status, err = adminClient.GetStatus(); err != nil {
			return &requeue{curError: err}
		}
	}

	// All further checks are pointless while the database is unavailable.
	if !status.Client.DatabaseStatus.Available {
		return &requeue{message: "cluster is not available", delayedRequeue: true, delay: retryDelay}
	}

	// Read the processes that are recorded as being under maintenance.
	underMaintenance, err := adminClient.GetProcessesUnderMaintenance()
	if err != nil {
		return &requeue{curError: err, delayedRequeue: true}
	}

	if activeZone := status.Cluster.MaintenanceZone; activeZone != "" {
		logger.Info("Cluster in maintenance mode", "zone", activeZone, "processesUnderMaintenance", underMaintenance)
	}

	// Split the recorded entries into finished ones, stale ones and the ones that still have to be updated.
	finished, stale, pending := maintenance.GetMaintenanceInformation(logger, status, underMaintenance, r.MaintenanceListStaleDuration, r.MaintenanceListWaitDuration)
	logger.Info("maintenance information", "finishedMaintenance", finished, "staleMaintenanceInformation", stale, "processesToUpdate", pending)

	// Finished and stale entries are no longer needed and get removed in a single call.
	if len(finished)+len(stale) > 0 {
		if err = adminClient.RemoveProcessesUnderMaintenance(append(finished, stale...)); err != nil {
			return &requeue{curError: err, delayedRequeue: true}
		}
	}

	// Without an active maintenance zone there is nothing to reset.
	zone := status.Cluster.MaintenanceZone
	if zone == "" {
		return nil
	}

	// Keep the maintenance mode on as long as some processes have not been updated yet.
	if waiting := len(pending); waiting > 0 {
		return &requeue{message: fmt.Sprintf("Waiting for %d processes in zone %s to be updated", waiting, zone), delayedRequeue: true, delay: retryDelay}
	}

	// The maintenance mode is only reset while holding the lock.
	locked, err := r.takeLock(logger, cluster, "maintenance mode check")
	if !locked {
		return &requeue{curError: err}
	}

	logger.Info("Switching off maintenance mode", "zone", zone)
	if err = adminClient.ResetMaintenanceMode(); err != nil {
		return &requeue{curError: err, delayedRequeue: true}
	}

	return nil
}