
Commit e390fae

Add coordinator fault tolerance check (#1827)
* Add coordinator fault tolerance check
* Fix mock client for DNS setup
1 parent 05c98d7 · commit e390fae

6 files changed: +416 -15 lines changed


api/v1beta2/foundationdb_process_address.go

Lines changed: 0 additions & 1 deletion
@@ -256,7 +256,6 @@ func (address ProcessAddress) String() string {
 	sb.WriteString(net.JoinHostPort(address.MachineAddress(), strconv.Itoa(address.Port)))

 	flags := address.SortedFlags()
-
 	if len(flags) > 0 {
 		sb.WriteString(":" + strings.Join(flags, ":"))
 	}
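For reference, String() renders the machine address and port via net.JoinHostPort and then appends any sorted flags, so a TLS address comes out as 10.1.1.5:4500:tls. A minimal standalone sketch of that formatting (the struct below is a simplified stand-in, not the real ProcessAddress type):

package main

import (
	"fmt"
	"net"
	"strconv"
	"strings"
)

// address is a simplified stand-in for the operator's ProcessAddress type.
type address struct {
	host  string
	port  int
	flags []string // assumed pre-sorted, e.g. the "tls" flag
}

func (a address) String() string {
	var sb strings.Builder
	// JoinHostPort also brackets IPv6 hosts correctly.
	sb.WriteString(net.JoinHostPort(a.host, strconv.Itoa(a.port)))
	if len(a.flags) > 0 {
		sb.WriteString(":" + strings.Join(a.flags, ":"))
	}
	return sb.String()
}

func main() {
	fmt.Println(address{host: "10.1.1.5", port: 4500, flags: []string{"tls"}})
	// Output: 10.1.1.5:4500:tls
}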

controllers/cluster_controller_test.go

Lines changed: 2 additions & 2 deletions
@@ -331,8 +331,7 @@ var _ = Describe("cluster_controller", func() {

 		It("should update the pods", func() {
 			pods := &corev1.PodList{}
-			err = k8sClient.List(context.TODO(), pods, getListOptions(cluster)...)
-			Expect(err).NotTo(HaveOccurred())
+			Expect(k8sClient.List(context.TODO(), pods, getListOptions(cluster)...)).NotTo(HaveOccurred())
 			for _, pod := range pods.Items {
 				env := make(map[string]string)
 				for _, envVar := range pod.Spec.InitContainers[0].Env {
@@ -342,6 +341,7 @@ var _ = Describe("cluster_controller", func() {
 			}
 		})
 	})
+
 	Context("when buggifying a pod to make it crash loop", func() {
 		BeforeEach(func() {
 			cluster.Spec.Buggify.CrashLoop = []fdbv1beta2.ProcessGroupID{"storage-1"}
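The test changes in this commit fold the intermediate err variable into the assertion; Gomega accepts an error return value directly as the actual value for NotTo(HaveOccurred()). A minimal sketch of the idiom (createResource is a hypothetical helper; the dot-imports match the style these tests already use, assuming Ginkgo v2):

package controllers_test

import (
	"context"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// createResource is a hypothetical helper that returns only an error.
func createResource(ctx context.Context) error { return nil }

var _ = It("creates the resource", func() {
	// Before: a throwaway err variable plus a separate assertion.
	err := createResource(context.TODO())
	Expect(err).NotTo(HaveOccurred())

	// After: assert on the call's error return directly, as this commit does.
	Expect(createResource(context.TODO())).NotTo(HaveOccurred())
})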

controllers/remove_process_groups_test.go

Lines changed: 4 additions & 5 deletions
@@ -43,8 +43,7 @@ var _ = Describe("remove_process_groups", func() {
 	Context("validating process removal", func() {
 		BeforeEach(func() {
 			cluster = internal.CreateDefaultCluster()
-			err := k8sClient.Create(context.TODO(), cluster)
-			Expect(err).NotTo(HaveOccurred())
+			Expect(k8sClient.Create(context.TODO(), cluster)).NotTo(HaveOccurred())

 			result, err := reconcileCluster(cluster)
 			Expect(err).NotTo(HaveOccurred())
@@ -193,15 +192,15 @@ var _ = Describe("remove_process_groups", func() {
 		})
 	})

-	When("Removing multiple process groups", func() {
+	When("removing multiple process groups", func() {
 		var initialCnt int
 		var secondRemovedProcessGroup *fdbv1beta2.ProcessGroupStatus

 		When("the removal mode is the default zone", func() {
 			BeforeEach(func() {
 				Expect(cluster.Spec.AutomationOptions.RemovalMode).To(BeEmpty())
 				initialCnt = len(cluster.Status.ProcessGroups)
-				secondRemovedProcessGroup = cluster.Status.ProcessGroups[1]
+				secondRemovedProcessGroup = cluster.Status.ProcessGroups[6]
 				marked, processGroup := fdbv1beta2.MarkProcessGroupForRemoval(cluster.Status.ProcessGroups, secondRemovedProcessGroup.ProcessGroupID, secondRemovedProcessGroup.ProcessClass, removedProcessGroup.Addresses[0])
 				Expect(marked).To(BeTrue())
 				Expect(processGroup).To(BeNil())
@@ -303,7 +302,7 @@ var _ = Describe("remove_process_groups", func() {
 				Expect(err).NotTo(HaveOccurred())

 				initialCnt = len(cluster.Status.ProcessGroups)
-				secondRemovedProcessGroup = cluster.Status.ProcessGroups[1]
+				secondRemovedProcessGroup = cluster.Status.ProcessGroups[6]
 				marked, processGroup := fdbv1beta2.MarkProcessGroupForRemoval(cluster.Status.ProcessGroups, secondRemovedProcessGroup.ProcessGroupID, secondRemovedProcessGroup.ProcessClass, removedProcessGroup.Addresses[0])
 				Expect(marked).To(BeTrue())
 				Expect(processGroup).To(BeNil())
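The assertions hint at the contract these tests rely on: when the targeted process group already exists in the status list, MarkProcessGroupForRemoval returns marked == true and a nil process group. A rough sketch of one plausible shape of that contract, inferred only from these assertions (simplified stand-in types, not the real fdbv1beta2 implementation):

package main

import "fmt"

// processGroupStatus is a simplified stand-in for the fdbv1beta2 type.
type processGroupStatus struct {
	ProcessGroupID   string
	Addresses        []string
	MarkedForRemoval bool
}

// markProcessGroupForRemoval sketches the contract implied by the test: when
// the ID is already present the existing entry is marked and (true, nil) is
// returned; otherwise a new, already-marked entry is returned for the caller
// to append.
func markProcessGroupForRemoval(groups []*processGroupStatus, id string, address string) (bool, *processGroupStatus) {
	for _, group := range groups {
		if group.ProcessGroupID != id {
			continue
		}
		group.MarkedForRemoval = true
		return true, nil
	}

	return false, &processGroupStatus{
		ProcessGroupID:   id,
		Addresses:        []string{address},
		MarkedForRemoval: true,
	}
}

func main() {
	groups := []*processGroupStatus{{ProcessGroupID: "storage-7"}}
	marked, created := markProcessGroupForRemoval(groups, "storage-7", "10.1.1.5")
	fmt.Println(marked, created == nil, groups[0].MarkedForRemoval) // true true true
}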

pkg/fdbadminclient/mock/admin_client_mock.go

Lines changed: 14 additions & 0 deletions
@@ -228,6 +228,13 @@ func (client *AdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error) {
 			for _, envVar := range container.Env {
 				if envVar.Name == "FDB_DNS_NAME" {
 					locality[fdbv1beta2.FDBLocalityDNSNameKey] = envVar.Value
+
+					if client.Cluster.UseDNSInClusterFile() {
+						fullAddress.StringAddress = envVar.Value
+					}
+					// TODO (johscheuer): This should be set to true. This will add additional information to the
+					// return address and needs some additional refactoring in the mock code.
+					// fullAddress.FromHostname = true
 				}
 			}
 		}
@@ -350,6 +357,7 @@ func (client *AdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error) {
 		status.Cluster.Clients.SupportedVersions = supportedVersions
 	}

+	var countReachableCoordinators int
 	for address, reachable := range coordinators {
 		pAddr, err := fdbv1beta2.ParseProcessAddress(address)
 		if err != nil {
@@ -360,8 +368,14 @@ func (client *AdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error) {
 			Address:   pAddr,
 			Reachable: reachable,
 		})
+
+		if reachable {
+			countReachableCoordinators++
+		}
 	}

+	minReachableCoordinators := (client.Cluster.DesiredCoordinatorCount() + 1) / 2
+	status.Client.Coordinators.QuorumReachable = countReachableCoordinators >= minReachableCoordinators
 	status.Client.DatabaseStatus.Available = true
 	status.Client.DatabaseStatus.Healthy = true
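The mock now reports QuorumReachable using a simple majority: with a desired coordinator count of n, at least (n+1)/2 coordinators must respond, i.e. 2 of 3, 3 of 5, 5 of 9. A quick sketch of that arithmetic:

package main

import "fmt"

func main() {
	// Majority quorum as computed in the mock client: with a desired
	// coordinator count of n, at least (n+1)/2 must be reachable.
	for _, desired := range []int{1, 3, 5, 9} {
		fmt.Printf("coordinators=%d -> need %d reachable\n", desired, (desired+1)/2)
	}
	// coordinators=1 -> need 1 reachable
	// coordinators=3 -> need 2 reachable
	// coordinators=5 -> need 3 reachable
	// coordinators=9 -> need 5 reachable
}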

pkg/fdbstatus/status_checks.go

Lines changed: 26 additions & 4 deletions
@@ -390,9 +390,26 @@ func DoLogServerFaultDomainCheckOnStatus(status *fdbv1beta2.FoundationDBStatus)
 }

 // DoCoordinatorFaultDomainCheckOnStatus does a coordinator related fault domain check over the given status object.
-// @note an empty function for now. We will revisit this later.
-func DoCoordinatorFaultDomainCheckOnStatus(_ *fdbv1beta2.FoundationDBStatus) error {
-	// TODO: decide if we need to do coordinator related check.
+func DoCoordinatorFaultDomainCheckOnStatus(status *fdbv1beta2.FoundationDBStatus) error {
+	notReachable := make([]string, 0, len(status.Client.Coordinators.Coordinators))
+	for _, coordinator := range status.Client.Coordinators.Coordinators {
+		if coordinator.Reachable {
+			continue
+		}
+
+		notReachable = append(notReachable, coordinator.Address.String())
+	}
+
+	if len(notReachable) > 0 {
+		return fmt.Errorf("not all coordinators are reachable, unreachable coordinators: %s", strings.Join(notReachable, ","))
+	}
+
+	// If this is the case the statements above should already catch the unreachable coordinators and print out a more
+	// detailed message.
+	if !status.Client.Coordinators.QuorumReachable {
+		return fmt.Errorf("quorum of coordinators is not reachable")
+	}
+
 	return nil
 }

@@ -430,7 +447,6 @@ func HasDesiredFaultToleranceFromStatus(log logr.Logger, status *fdbv1beta2.Foun
 		return false
 	}

-	// TODO (johscheuer): Should those checks be specific to the Kubernetes cluster (DC) we are requesting?
 	// Should we also add a method to check the different process classes? Currently the degraded log fault tolerance
 	// will block the removal of a storage process.
 	err := DoStorageServerFaultDomainCheckOnStatus(status)
@@ -445,5 +461,11 @@ func HasDesiredFaultToleranceFromStatus(log logr.Logger, status *fdbv1beta2.Foun
 		return false
 	}

+	err = DoCoordinatorFaultDomainCheckOnStatus(status)
+	if err != nil {
+		log.Info("Fault domain check for coordinator servers failed", "error", err)
+		return false
+	}
+
 	return true
 }
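Taken together, the new check fails fast with the list of unreachable coordinators and only falls back to the quorum flag when every coordinator individually looks reachable. A self-contained sketch of that decision order, using simplified stand-ins for the status types:

package main

import (
	"fmt"
	"strings"
)

// Simplified mirrors of the status fields the check reads.
type coordinator struct {
	Address   string
	Reachable bool
}

type coordinatorsStatus struct {
	Coordinators    []coordinator
	QuorumReachable bool
}

// checkCoordinators follows the same logic as the new
// DoCoordinatorFaultDomainCheckOnStatus: report unreachable coordinators
// first, then fall back to the quorum flag.
func checkCoordinators(status coordinatorsStatus) error {
	notReachable := make([]string, 0, len(status.Coordinators))
	for _, c := range status.Coordinators {
		if !c.Reachable {
			notReachable = append(notReachable, c.Address)
		}
	}

	if len(notReachable) > 0 {
		return fmt.Errorf("not all coordinators are reachable, unreachable coordinators: %s", strings.Join(notReachable, ","))
	}

	if !status.QuorumReachable {
		return fmt.Errorf("quorum of coordinators is not reachable")
	}

	return nil
}

func main() {
	status := coordinatorsStatus{
		Coordinators: []coordinator{
			{Address: "10.1.1.5:4500", Reachable: true},
			{Address: "10.1.1.6:4500", Reachable: false},
			{Address: "10.1.1.7:4500", Reachable: true},
		},
		QuorumReachable: true,
	}
	fmt.Println(checkCoordinators(status))
	// not all coordinators are reachable, unreachable coordinators: 10.1.1.6:4500
}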
