-
Notifications
You must be signed in to change notification settings - Fork 84
/
Copy pathmaintenance.go
128 lines (107 loc) · 5.84 KB
/
maintenance.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
* maintenance.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package maintenance
import (
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/go-logr/logr"
"time"
)
// GetMaintenanceInformation classifies the entries of processesUnderMaintenance against the provided
// machine-readable status and returns three lists, in this order:
//   - process groups whose maintenance is finished, i.e. the computed process start time is after the recorded
//     maintenance start,
//   - process groups whose entry is considered stale and should be cleaned up,
//   - process groups that still must be updated.
//
// The values of processesUnderMaintenance are interpreted as Unix timestamps in seconds. If status is nil, every
// entry is returned as a process to update and the other two results are nil. If processesUnderMaintenance is
// empty, all three results are nil.
//
// Note that every entry that is matched against a storage process of the status is deleted from the provided
// processesUnderMaintenance map, so the map is modified by this call.
func GetMaintenanceInformation(logger logr.Logger, status *fdbv1beta2.FoundationDBStatus, processesUnderMaintenance map[fdbv1beta2.ProcessGroupID]int64, staleDuration time.Duration, differentZoneWaitDuration time.Duration) ([]fdbv1beta2.ProcessGroupID, []fdbv1beta2.ProcessGroupID, []fdbv1beta2.ProcessGroupID) {
	entryCount := len(processesUnderMaintenance)

	// Without a status nothing can be evaluated, so every entry is reported as still to be updated.
	if status == nil {
		logger.Info("provided status is empty")

		pending := make([]fdbv1beta2.ProcessGroupID, 0, entryCount)
		for id := range processesUnderMaintenance {
			pending = append(pending, id)
		}

		return nil, nil, pending
	}

	logger.Info("start evaluation", "processesUnderMaintenance", processesUnderMaintenance)

	// With an empty maintenance list there is nothing to classify, which saves the iteration over all processes
	// of the cluster.
	if entryCount == 0 {
		return nil, nil, nil
	}

	finished := make([]fdbv1beta2.ProcessGroupID, 0, entryCount)
	stale := make([]fdbv1beta2.ProcessGroupID, 0, entryCount)
	pending := make([]fdbv1beta2.ProcessGroupID, 0, entryCount)

	for _, process := range status.Cluster.Processes {
		// The maintenance mode only affects storage processes.
		if process.ProcessClass != fdbv1beta2.ProcessClassStorage {
			continue
		}

		rawID, hasID := process.Locality[fdbv1beta2.FDBLocalityInstanceIDKey]
		if !hasID {
			continue
		}

		// Processes that are not part of the maintenance list need no further evaluation.
		id := fdbv1beta2.ProcessGroupID(rawID)
		maintenanceStart, tracked := processesUnderMaintenance[id]
		if !tracked {
			continue
		}

		// Derive the start time of the process from the current time and the uptime reported by the process.
		processStart := time.Now().Add(-1 * time.Duration(process.UptimeSeconds) * time.Second)
		maintenanceBegin := time.Unix(maintenanceStart, 0)

		// A process without zone locality is left in processesUnderMaintenance and is handled by the
		// staleness check after this loop.
		zoneID, hasZone := process.Locality[fdbv1beta2.FDBLocalityZoneIDKey]
		if !hasZone {
			continue
		}

		logger.Info("found process under maintenance", "processGroupID", rawID, "zoneID", zoneID, "currentMaintenance", status.Cluster.MaintenanceZone, "startTime", processStart.String(), "maintenanceStartTime", maintenanceBegin.String(), "UptimeSeconds", process.UptimeSeconds)

		// A matching process was found, so the entry must not be evaluated again by the staleness check below.
		delete(processesUnderMaintenance, id)

		switch {
		case processStart.After(maintenanceBegin):
			// The process started after the maintenance began, so the maintenance for it is assumed to be done.
			finished = append(finished, id)
		case zoneID == string(status.Cluster.MaintenanceZone):
			// The process is in the zone that is currently under maintenance and was not restarted yet.
			pending = append(pending, id)
		default:
			// The zones differ, so this is probably a stale entry. The two checks below are independent of
			// each other: depending on the configured durations an entry can end up in either list, in both
			// or in none of them.
			age := time.Since(maintenanceBegin)

			// Recently added entries are still reported as to be updated, even with a different zone. This
			// reduces the risk of acting on an outdated machine-readable status, e.g. because of CPU
			// throttling or a cached status combined with a slow reconciliation.
			if age < differentZoneWaitDuration {
				pending = append(pending, id)
			}

			// Entries older than the stale duration are assumed to be leftovers that should be cleaned up.
			if age > staleDuration {
				stale = append(stale, id)
			}
		}
	}

	// Everything still in processesUnderMaintenance was not matched against a storage process above. Those
	// entries are either old enough to be stale, which makes sure they are eventually cleaned up, or they
	// still have to be updated.
	for id, maintenanceStart := range processesUnderMaintenance {
		if time.Since(time.Unix(maintenanceStart, 0)) > staleDuration {
			stale = append(stale, id)
		} else {
			pending = append(pending, id)
		}
	}

	return finished, stale, pending
}