[feature] Support replication lag checking #4
base: master
Changes from 1 commit
@@ -18,13 +18,16 @@ package hasql

 import (
 	"context"
+	"fmt"
 	"sort"
 	"sync"
 	"time"
 )

 type checkedNode struct {
-	Node    Node
+	Node Node

+	Primary bool
 	Latency time.Duration
 }
@@ -89,11 +92,11 @@ func (nodes groupedCheckedNodes) Alive() []Node {
 	return res
 }

-type checkExecutorFunc func(ctx context.Context, node Node) (bool, time.Duration, error)
+type checkExecutorFunc func(ctx context.Context, node *checkedNode) error

 // checkNodes takes slice of nodes, checks them in parallel and returns the alive ones.
 // Accepts customizable executor which enables time-independent tests for node sorting based on 'latency'.
-func checkNodes(ctx context.Context, nodes []Node, executor checkExecutorFunc, tracer Tracer) AliveNodes {
+func checkNodes(ctx context.Context, nodes []Node, tracer Tracer, executors ...checkExecutorFunc) AliveNodes {
 	checkedNodes := groupedCheckedNodes{
 		Primaries: make(checkedNodesList, 0, len(nodes)),
 		Standbys:  make(checkedNodesList, 0, len(nodes)),
@@ -106,24 +109,26 @@ func checkNodes(ctx context.Context, nodes []Node, executor checkExecutorFunc, tracer Tracer) AliveNodes {
 		go func(node Node, wg *sync.WaitGroup) {
 			defer wg.Done()

-			primary, duration, err := executor(ctx, node)
-			if err != nil {
-				if tracer.NodeDead != nil {
-					tracer.NodeDead(node, err)
-				}
-
-				return
-			}
+			nl := checkedNode{Node: node}
+
+			for _, executor := range executors {
+				err := executor(ctx, &nl)
+				if err != nil {
+					if tracer.NodeDead != nil {
+						tracer.NodeDead(node, err)
+					}
+
+					return
+				}
+			}

 			if tracer.NodeAlive != nil {
 				tracer.NodeAlive(node)
 			}

-			nl := checkedNode{Node: node, Latency: duration}
-
 			mu.Lock()
 			defer mu.Unlock()
-			if primary {
+			if nl.Primary {
 				checkedNodes.Primaries = append(checkedNodes.Primaries, nl)
 			} else {
 				checkedNodes.Standbys = append(checkedNodes.Standbys, nl)
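Because checkNodes now takes a variadic list of executors, a time-independent test (as the doc comment above suggests) can stub the whole chain with a single function. A rough sketch, with illustrative values only:

```go
// Illustrative test stub, not part of the PR: it marks every node as a primary
// with a fixed latency, so sorting behaviour can be exercised without real checks.
// `nodes` is assumed to be a []Node prepared by the test.
stub := func(ctx context.Context, node *checkedNode) error {
	node.Primary = true
	node.Latency = 10 * time.Millisecond
	return nil
}

alive := checkNodes(context.Background(), nodes, Tracer{}, stub)
```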
@@ -142,16 +147,37 @@ func checkNodes(ctx context.Context, nodes []Node, executor checkExecutorFunc, tracer Tracer) AliveNodes {
 	}
 }

-// checkExecutor returns checkExecutorFunc which can execute supplied check.
-func checkExecutor(checker NodeChecker) checkExecutorFunc {
-	return func(ctx context.Context, node Node) (bool, time.Duration, error) {
+// checkRoleExecutor returns checkExecutorFunc which can execute role check.
+func checkRoleExecutor(checker NodeChecker) checkExecutorFunc {
+	return func(ctx context.Context, target *checkedNode) error {
 		ts := time.Now()
-		primary, err := checker(ctx, node.DB())
+		primary, err := checker(ctx, target.Node.DB())
 		d := time.Since(ts)
 		if err != nil {
-			return false, d, err
+			return fmt.Errorf("unable to check node role: %w", err)
 		}

-		return primary, d, nil
+		target.Primary = primary
+		target.Latency = d
+
+		return nil
+	}
+}
+
+// checkReplicationLagExecutor returns checkExecutorFunc which can execute replication lag check.
+func checkReplicationLagExecutor(checker ReplicationLagChecker, maxLag time.Duration) checkExecutorFunc {
+	return func(ctx context.Context, target *checkedNode) error {
+		if checker == nil || maxLag == 0 || target.Primary {
+			return nil
+		}
+
+		lag, err := checker(ctx, target.Node.DB())
+		if err != nil {
+			return fmt.Errorf("cannot check node replication lag: %w", err)
+		}
+		if lag > maxLag {
+			return fmt.Errorf("replication lag is too big: %s", lag)
+		}
+		return nil
 	}
 }

Review comment on the "replication lag is too big" error: We need typed errors, since tracer.NodeDead can receive an error both for "can't check" and for "too much lag". The question is whether we need to create an actual type for the replication lag error. The user might want to know the exact lag value, or might not. Sentinel errors are a bit easier to work with but provide less information.
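One possible answer to the typed-errors question (purely illustrative; neither the type nor its fields exist in this PR) is a small error type that carries the measured lag, so a tracer can distinguish a failed check from a lagging replica and still read the exact value via errors.As:

```go
// Illustrative sketch, not part of the PR: a typed error carrying the measured lag.
type ReplicationLagError struct {
	Lag    time.Duration // measured replication lag
	MaxLag time.Duration // configured threshold that was exceeded
}

func (e *ReplicationLagError) Error() string {
	return fmt.Sprintf("replication lag is too big: %s (max allowed: %s)", e.Lag, e.MaxLag)
}
```

checkReplicationLagExecutor would then return &ReplicationLagError{Lag: lag, MaxLag: maxLag}, while a plain sentinel checked with errors.Is remains the simpler option when the exact value is not needed.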
@@ -19,9 +19,15 @@ package checkers
 import (
 	"context"
 	"database/sql"
+	"time"
 )

 // PostgreSQL checks whether PostgreSQL server is primary or not.
 func PostgreSQL(ctx context.Context, db *sql.DB) (bool, error) {
 	return Check(ctx, db, "SELECT NOT pg_is_in_recovery()")
 }
+
+// PostgreSQLReplicationLag returns replication lag value for PostgreSQL replica node.
+func PostgreSQLReplicationLag(ctx context.Context, db *sql.DB) (time.Duration, error) {
+	return ReplicationLag(ctx, db, "SELECT NOW() - pg_last_xact_replay_timestamp()")
+}

Review comment on the lag query: Unfortunately, on a low-activity cluster this would lead to marking all replicas as lagging.
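A common mitigation for the low-activity problem (not part of this PR; the function name below is hypothetical) is to report zero lag whenever the standby has already replayed all WAL it has received, e.g. using the PostgreSQL 10+ functions:

```go
// Hypothetical variant, not in this PR: a fully caught-up standby reports zero lag,
// so an idle primary does not make every replica look stale.
func PostgreSQLReplicationLagIdleAware(ctx context.Context, db *sql.DB) (time.Duration, error) {
	return ReplicationLag(ctx, db, `
		SELECT CASE
			WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN interval '0'
			ELSE now() - pg_last_xact_replay_timestamp()
		END`)
}
```

Whether this works as-is depends on how the ReplicationLag helper scans the returned interval; the sketch only shows the shape of the query.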
@@ -53,6 +53,9 @@ type Cluster struct {
 	updateTimeout time.Duration
 	checker       NodeChecker
 	picker        NodePicker
+	// Replication lag configuration
+	lagChecker  ReplicationLagChecker
+	maxLagValue time.Duration

 	// Status
 	updateStopper chan struct{}

Review comment on maxLagValue: maxAllowedLag is more descriptive IMO.
@@ -318,7 +321,12 @@ func (cl *Cluster) updateNodes() {
 	ctx, cancel := context.WithTimeout(context.Background(), cl.updateTimeout)
 	defer cancel()

-	alive := checkNodes(ctx, cl.nodes, checkExecutor(cl.checker), cl.tracer)
+	checkExecutors := []checkExecutorFunc{
+		checkRoleExecutor(cl.checker),
+		checkReplicationLagExecutor(cl.lagChecker, cl.maxLagValue),
+	}
+
+	alive := checkNodes(ctx, cl.nodes, cl.tracer, checkExecutors...)
 	cl.aliveNodes.Store(alive)

 	if cl.tracer.UpdatedNodes != nil {
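The options that populate lagChecker and maxLagValue are not shown in these hunks. Purely as a sketch of the intended usage, with hypothetical option names (WithReplicationLagChecker and WithMaxReplicationLag are not confirmed by this diff):

```go
// Hypothetical usage sketch: the two With* options below are illustrative only.
cluster, err := hasql.NewCluster(
	[]hasql.Node{hasql.NewNode("db1", db1), hasql.NewNode("db2", db2)},
	checkers.PostgreSQL,
	hasql.WithReplicationLagChecker(checkers.PostgreSQLReplicationLag), // hypothetical
	hasql.WithMaxReplicationLag(10*time.Second),                        // hypothetical
)
if err != nil {
	log.Fatal(err)
}
defer cluster.Close()
```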
Review comment on the new checkedNode fields: I really don't like adding more transitory values to the node. It provokes accessing them after the check itself, when these values might be invalid. Can you please add a comment before these values making it clear that they are transitory and should not be used after the check itself? Also, making them private seems like a good idea.
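A minimal sketch of what that suggestion could look like (field names kept from the PR, only visibility and the comment changed; this is not the author's actual follow-up):

```go
// Sketch only, not part of the PR: unexported, documented transitory fields.
type checkedNode struct {
	Node Node

	// primary and latency are transitory values filled in while the node is
	// being checked; they are not kept up to date afterwards and must not be
	// used outside of the check itself.
	primary bool
	latency time.Duration
}
```

Since everything lives in package hasql, unexporting the fields only requires touching checkNodes and the latency-based sorting code in the same package.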