Skip to content

Commit

Permalink
OCPBUGS-36301: parallelize member health checks
Browse files Browse the repository at this point in the history
https://issues.redhat.com/browse/OCPBUGS-36301

Currently,
member health is checked in serial with a 30s timeout per member.
3 out of 4 GetMemberHealth callers had their own default 30s timeout as well for the entire process.
Because of this,
a slow check on one member could exhaust the timeout for the entire GetMemberHealth function,
and thus cause later-checked members
to report as unhealthy even though they were fine.

With this commit,
I am dropping the internal 30s timeout from GetMemberHealth,
and instead letting the caller set the timeout.
Also, the code now checks the health of all members in parallel.
This will prevent a single slow member
from affecting the health reporting of other members.

I also added a timeout to the context used in IsMemberHealthy
which calls GetMemberHealth.
Neither Trevor nor I were sure why a default timeout wasn't present there,
though one was present in all other callsites.
  • Loading branch information
AlexVulaj authored and hasbro17 committed Jul 3, 2024
1 parent f8ec2ac commit 94d3821
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 22 deletions.
2 changes: 2 additions & 0 deletions pkg/etcdcli/etcdcli.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,8 @@ func (g *etcdClientGetter) IsMemberHealthy(ctx context.Context, member *etcdserv
if member == nil {
return false, fmt.Errorf("member can not be nil")
}
ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout)
defer cancel()
memberHealth := GetMemberHealth(ctx, []*etcdserverpb.Member{member})
if len(memberHealth) == 0 {
return false, fmt.Errorf("member health check failed")
Expand Down
31 changes: 9 additions & 22 deletions pkg/etcdcli/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,34 +43,21 @@ type memberHealth []healthCheck

func GetMemberHealth(ctx context.Context, etcdMembers []*etcdserverpb.Member) memberHealth {
memberHealth := memberHealth{}
resChan := make(chan healthCheck, 1)
for _, member := range etcdMembers {
if !HasStarted(member) {
memberHealth = append(memberHealth, healthCheck{Member: member, Healthy: false})
continue
}

const defaultTimeout = 30 * time.Second
resChan := make(chan healthCheck, 1)
go func() {
ctxTimeout, cancel := context.WithTimeout(ctx, defaultTimeout)
defer cancel()

resChan <- checkSingleMemberHealth(ctxTimeout, member)
// closing here to avoid late replies to panic on resChan,
// the result will be considered a timeout anyway
close(resChan)
}()

select {
case res := <-resChan:
memberHealth = append(memberHealth, res)
case <-time.After(defaultTimeout):
memberHealth = append(memberHealth, healthCheck{
Member: member,
Healthy: false,
Error: fmt.Errorf("30s timeout waiting for member %s to respond to health check",
member.Name)})
}
go func(m *etcdserverpb.Member) {
resChan <- checkSingleMemberHealth(ctx, m)
}(member)
}

for len(memberHealth) < len(etcdMembers) {
res := <-resChan
memberHealth = append(memberHealth, res)
}

// Purge any unknown members from the raft term metrics collector.
Expand Down

0 comments on commit 94d3821

Please sign in to comment.