Skip to content

Commit 897f5ce

Browse files
authored
refactor: refactor the implementation (#37)
1 parent 35903bb commit 897f5ce

File tree

4 files changed

+98
-15
lines changed

4 files changed

+98
-15
lines changed

config.go

+13-2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ type Config struct {
4040
maxJoinAttempts int
4141
// specifies the join retry interval
4242
joinRetryInterval time.Duration
43+
joinTimeout time.Duration
4344
// specifies the discovery provider
4445
provider discovery.Provider
4546
// specifies the node client port
@@ -89,9 +90,10 @@ func NewConfig() *Config {
8990
maxJoinAttempts: 5,
9091
joinRetryInterval: time.Second,
9192
shutdownTimeout: 3 * time.Second,
93+
joinTimeout: time.Minute,
9294
syncInterval: time.Minute,
9395
logger: log.New(log.ErrorLevel, os.Stderr),
94-
readTimeout: time.Second,
96+
readTimeout: time.Minute,
9597
}
9698
}
9799

@@ -143,14 +145,20 @@ func (config *Config) WithHost(host string) *Config {
143145
return config
144146
}
145147

146-
// WithSyncInterval sets the delegate sync interval
148+
// WithSyncInterval sets the cluster synchronization interval.
149+
// This is the interval between complete state synchronizations between nodes.
150+
// A complete state synchronization is done with a single node over TCP and is
151+
// quite expensive relative to standard gossiped messages.
152+
// Setting this interval lower (more frequent) will increase convergence
153+
// speed across larger clusters at the expense of increased bandwidth usage.
147154
func (config *Config) WithSyncInterval(interval time.Duration) *Config {
148155
config.syncInterval = interval
149156
return config
150157
}
151158

152159
// WithReadTimeout sets the Node read timeout.
153160
// This timeout specifies the timeout of a data retrieval
161+
// The read timeout should be greater than or equal to syncInterval.
154162
func (config *Config) WithReadTimeout(timeout time.Duration) *Config {
155163
config.readTimeout = timeout
156164
return config
@@ -186,8 +194,11 @@ func (config *Config) Validate() error {
186194
AddAssertion(config.provider != nil, "discovery provider is not set").
187195
AddAssertion(config.joinRetryInterval > 0, "join retry interval is invalid").
188196
AddAssertion(config.shutdownTimeout > 0, "shutdown timeout is invalid").
197+
AddAssertion(config.joinTimeout > 0, "join timeout is invalid").
189198
AddAssertion(config.maxJoinAttempts > 0, "max join attempts is invalid").
190199
AddAssertion(config.syncInterval > 0, "stateSync interval is invalid").
200+
AddAssertion(config.readTimeout > 0, "read timeout is invalid").
201+
AddAssertion(config.joinTimeout > config.joinRetryInterval, "join timeout must greater than join retry interval").
191202
AddValidator(validation.NewEmptyStringValidator("host", config.host)).
192203
AddValidator(validation.NewConditionalValidator(len(config.secretKeys) != 0,
193204
validation.NewEmptyStringValidator("config.cookie", config.cookie))).

internal/tcp/tcp.go

+42-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,12 @@
2424

2525
package tcp
2626

27-
import "net"
27+
import (
28+
"fmt"
29+
"net"
30+
31+
"github.com/hashicorp/go-sockaddr"
32+
)
2833

2934
// GetHostPort returns the actual ip address and port from a given address
3035
func GetHostPort(address string) (string, int, error) {
@@ -36,3 +41,39 @@ func GetHostPort(address string) (string, int, error) {
3641

3742
return addr.IP.String(), addr.Port, nil
3843
}
44+
45+
// GetBindIP tries to find an appropriate bindIP to bind and propagate.
46+
func GetBindIP(address string) (string, error) {
47+
bindIP, _, err := GetHostPort(address)
48+
if err != nil {
49+
return "", fmt.Errorf("invalid address: %w", err)
50+
}
51+
52+
if bindIP == "0.0.0.0" {
53+
// if we're not bound to a specific IP, let's use a suitable private IP address.
54+
ipStr, err := sockaddr.GetPrivateIP()
55+
if err != nil {
56+
return "", fmt.Errorf("failed to get private interface addresses: %w", err)
57+
}
58+
59+
// if we could not find a private address, we need to expand our search to a public
60+
// ip address
61+
if ipStr == "" {
62+
ipStr, err = sockaddr.GetPublicIP()
63+
if err != nil {
64+
return "", fmt.Errorf("failed to get public interface addresses: %w", err)
65+
}
66+
}
67+
68+
if ipStr == "" {
69+
return "", fmt.Errorf("no private IP address found, and explicit IP not provided")
70+
}
71+
72+
parsed := net.ParseIP(ipStr)
73+
if parsed == nil {
74+
return "", fmt.Errorf("failed to parse private IP address: %q", ipStr)
75+
}
76+
bindIP = parsed.String()
77+
}
78+
return bindIP, nil
79+
}

node.go

+41-12
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"time"
3939

4040
"connectrpc.com/connect"
41+
"github.com/flowchartsman/retry"
4142
"github.com/hashicorp/memberlist"
4243
"go.uber.org/atomic"
4344
"google.golang.org/protobuf/types/known/timestamppb"
@@ -156,7 +157,7 @@ func (node *Node) Start(ctx context.Context) error {
156157
AddError(node.config.Validate()).
157158
AddError(node.config.provider.Initialize()).
158159
AddError(node.config.provider.Register()).
159-
AddError(node.join()).
160+
AddError(node.join(ctx)).
160161
AddError(node.serve(ctx)).
161162
Error(); err != nil {
162163
node.mu.Unlock()
@@ -231,17 +232,35 @@ func (node *Node) Put(ctx context.Context, request *connect.Request[internalpb.P
231232
}
232233

233234
// Get is used to retrieve a key/value pair in a cluster of nodes
234-
// nolint
235235
func (node *Node) Get(ctx context.Context, request *connect.Request[internalpb.GetRequest]) (*connect.Response[internalpb.GetResponse], error) {
236236
node.mu.Lock()
237237
if !node.started.Load() {
238238
node.mu.Unlock()
239239
return nil, connect.NewError(connect.CodeFailedPrecondition, ErrNodeNotStarted)
240240
}
241241

242+
ctx, cancelFn := context.WithTimeout(ctx, node.config.readTimeout)
243+
defer cancelFn()
244+
242245
req := request.Msg
243-
entry, err := node.delegate.Get(req.GetKey())
244-
if err != nil {
246+
var (
247+
rerr error
248+
entry *internalpb.Entry
249+
)
250+
251+
retrier := retry.NewRetrier(2, node.config.readTimeout, node.config.syncInterval)
252+
if err := retrier.RunContext(ctx, func(ctx context.Context) error {
253+
select {
254+
case <-ctx.Done():
255+
return ctx.Err()
256+
default:
257+
entry, rerr = node.delegate.Get(req.GetKey())
258+
if rerr != nil {
259+
return rerr
260+
}
261+
}
262+
return nil
263+
}); err != nil {
245264
node.mu.Unlock()
246265
return nil, connect.NewError(connect.CodeNotFound, err)
247266
}
@@ -342,13 +361,14 @@ func (node *Node) Peers() ([]*Member, error) {
342361
// serve starts the underlying HTTP server
343362
func (node *Node) serve(ctx context.Context) error {
344363
// extract the actual TCP ip discoveryAddress
345-
host, port, err := tcp.GetHostPort(fmt.Sprintf("%s:%d", node.config.host, node.config.port))
364+
hostPort := net.JoinHostPort(node.config.host, strconv.Itoa(int(node.config.port)))
365+
bindIP, err := tcp.GetBindIP(hostPort)
346366
if err != nil {
347367
return fmt.Errorf("failed to resolve TCP discoveryAddress: %w", err)
348368
}
349369

350-
node.config.WithHost(host)
351-
node.config.WithPort(uint16(port))
370+
node.config.WithHost(bindIP)
371+
node.config.WithPort(uint16(node.config.port))
352372

353373
// hook the node as the KV service handler
354374
// TODO: add metric options to the handler
@@ -372,20 +392,29 @@ func (node *Node) serve(ctx context.Context) error {
372392
}
373393

374394
// join attempts to join an existing cluster if node peers are provided
375-
func (node *Node) join() error {
395+
func (node *Node) join(ctx context.Context) error {
376396
mlist, err := memberlist.Create(node.memberConfig)
377397
if err != nil {
378398
node.config.logger.Error(fmt.Errorf("failed to create memberlist: %w", err))
379399
return err
380400
}
381401

382-
// TODO: use a retry mechanism here
383-
peers, err := node.config.provider.DiscoverPeers()
384-
if err != nil {
385-
node.config.logger.Error(fmt.Errorf("failed to discover peers: %w", err))
402+
ctx2, cancel := context.WithTimeout(ctx, node.config.joinTimeout)
403+
var peers []string
404+
retrier := retry.NewRetrier(node.config.maxJoinAttempts, node.config.joinRetryInterval, node.config.joinRetryInterval)
405+
if err := retrier.RunContext(ctx2, func(ctx context.Context) error { // nolint
406+
peers, err = node.config.provider.DiscoverPeers()
407+
if err != nil {
408+
return err
409+
}
410+
return nil
411+
}); err != nil {
412+
cancel()
386413
return err
387414
}
388415

416+
cancel()
417+
389418
// set the mlist
390419
node.memberlist = mlist
391420
if len(peers) > 0 {

node_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ func startNode(t *testing.T, serverAddr string) (*Node, discovery.Provider) {
251251
host: host,
252252
syncInterval: 500 * time.Millisecond,
253253
joinRetryInterval: 500 * time.Millisecond,
254+
joinTimeout: time.Second,
255+
readTimeout: 500 * time.Millisecond,
254256
maxJoinAttempts: 5,
255257
cookie: cookie,
256258
secretKeys: []string{b64},

0 commit comments

Comments
 (0)