dotandev · Hallab7 · Mar 26, 2026
diff --git a/docs/GLOBAL_TIMEOUT.md b/docs/GLOBAL_TIMEOUT.md
@@ -0,0 +1,193 @@
+# Global Timeout for Multi-Node Failover
+
+## Overview
+
+The SDK now supports a global timeout feature that caps the total time spent across all RPC nodes during failover operations. This prevents scenarios where slow nodes cause the entire failover loop to take too long.
+
+## Problem Solved
+
+Previously, if all RPC nodes were slow (but not completely unresponsive), the failover loop could take an excessive amount of time. For example, with 3 nodes each taking 30 seconds to time out, the total operation could take up to 90 seconds.
+
+With the global timeout feature, you can set a maximum time limit for the entire failover operation, regardless of how many nodes are configured.
+
+## Configuration
+
+### Go SDK
+
+#### Using NetworkConfig
+
+```go
+config := rpc.NetworkConfig{
+    Name:              "custom",
+    HorizonURL:        "https://horizon.stellar.org",
+    NetworkPassphrase: "Public Global Stellar Network ; September 2015",
+    SorobanRPCURL:     "https://soroban-rpc.stellar.org",
+    TotalTimeout:      30 * time.Second, // 30 second global timeout
+}
+
+client, err := rpc.NewClient(rpc.WithNetworkConfig(config))
+```
+
+#### Using Builder Options
+
+```go
+client, err := rpc.NewClient(
+    rpc.WithNetwork(rpc.Mainnet),
+    rpc.WithAltURLs([]string{
+        "https://horizon1.stellar.org",
+        "https://horizon2.stellar.org",
+        "https://horizon3.stellar.org",
+    }),
+    rpc.WithTotalTimeout(45 * time.Second), // 45 second global timeout
+)
+```
+
+#### Default Values
+
+The predefined network configurations have a default global timeout of 60 seconds:
+- `TestnetConfig.TotalTimeout = 60 * time.Second`
+- `MainnetConfig.TotalTimeout = 60 * time.Second`
+- `FuturenetConfig.TotalTimeout = 60 * time.Second`
+
+### TypeScript SDK
+
+#### Using RPCConfig
+
+```typescript
+import { FallbackRPCClient, RPCConfigParser } from './rpc';
+
+const config = RPCConfigParser.loadConfig({
+    rpc: ['https://rpc1.stellar.org', 'https://rpc2.stellar.org'],
+    timeout: 30000,      // Individual request timeout (30s)
+    totalTimeout: 60000, // Global timeout for all nodes (60s)
+    retries: 3,
+});
+
+const client = new FallbackRPCClient(config);
+```
+
+#### Default Values
+
+The TypeScript SDK has a default global timeout of 60 seconds (60000ms) when using `RPCConfigParser.loadConfig()`.
+
+## Behavior
+
+### When Global Timeout is Enabled
+
+1. **Timeout Enforcement**: The SDK starts a timer when beginning the failover loop
+2. **Context Cancellation**: If the global timeout is reached, the operation is cancelled immediately
+3. **Error Response**: A timeout error is returned indicating the global timeout was exceeded
+4. **Early Termination**: The SDK stops trying additional nodes once the timeout is reached
+
+### When Global Timeout is Disabled
+
+- Set `TotalTimeout` to `0` (Go) or `totalTimeout` to `0` (TypeScript) to disable
+- The SDK will attempt all configured nodes regardless of total time spent
+- Individual request timeouts still apply
+
+## Affected Methods
+
+The global timeout applies to all methods that use multi-node failover:
+
+### Go SDK
+- `GetTransaction()`
+- `GetLedgerHeader()`
+- `SimulateTransaction()`
+- `GetHealth()`
+- `GetLedgerEntries()` (single batch mode)
+
+### TypeScript SDK
+- All methods that use `FallbackRPCClient.request()`
+- `getTransaction()`
+- `simulateTransaction()`
+- `getHealth()`
+- `getLatestLedger()`
+
+## Examples
+
+### Scenario 1: Fast Failover
+
+```go
+// 3 nodes, each taking 2 seconds to fail
+// Without global timeout: 6+ seconds total
+// With 4-second global timeout: ~4 seconds total
+
+client, _ := rpc.NewClient(
+    rpc.WithAltURLs([]string{"slow1", "slow2", "slow3"}),
+    rpc.WithTotalTimeout(4 * time.Second),
+)
+
+start := time.Now()
+_, err := client.GetTransaction(ctx, "hash")
+elapsed := time.Since(start) // ~4 seconds, not 6+
+```
+
+### Scenario 2: Success Within Timeout
+
+```go
+// First node fails quickly, second succeeds
+// Total time: <1 second (well within 10s timeout)
+
+client, _ := rpc.NewClient(
+    rpc.WithAltURLs([]string{"fail-fast", "success"}),
+    rpc.WithTotalTimeout(10 * time.Second),
+)
+
+resp, err := client.GetTransaction(ctx, "hash") // Success
+```
+
+## Error Handling
+
+### Go SDK
+
+```go
+_, err := client.GetTransaction(ctx, "hash")
+if err != nil {
+    if strings.Contains(err.Error(), "global timeout exceeded") {
+        // Handle global timeout
+        log.Printf("Operation timed out after %v", client.Config.TotalTimeout)
+    } else {
+        // Handle other errors
+        log.Printf("Other error: %v", err)
+    }
+}
+```
+
+### TypeScript SDK
+
+```typescript
+try {
+    const result = await client.request('/transaction/hash');
+} catch (error) {
+    if (error.message.includes('Global timeout exceeded')) {
+        // Handle global timeout
+        console.log(`Operation timed out after ${config.totalTimeout}ms`);
+    } else {
+        // Handle other errors
+        console.log(`Other error: ${error.message}`);
+    }
+}
+```
+
+## Best Practices
+
+1. **Set Reasonable Timeouts**: Consider your application's latency requirements
+2. **Monitor Metrics**: Track timeout occurrences to tune the timeout value
+3. **Fallback Strategy**: Have a plan for when all nodes time out
+4. **Individual vs Global**: Set each individual request timeout shorter than the global timeout, so that more than one node can be tried before the global deadline expires
+5. **Testing**: Test timeout behavior with slow/unresponsive test servers
+
+## Migration
+
+### Existing Code
+
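+Existing code continues to work without source changes. Note that the predefined network configurations now apply a default 60-second global timeout, so a failover sequence that previously ran longer than 60 seconds is now cut off at that limit; this provides reasonable protection against excessively long failover operations.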
+
+### Upgrading
+
+To take advantage of custom global timeouts:
+
+1. **Go**: Use `WithTotalTimeout()` builder option or set `TotalTimeout` in `NetworkConfig`
+2. **TypeScript**: Pass `totalTimeout` to `RPCConfigParser.loadConfig()`
+
+No breaking changes are introduced by this feature.
diff --git a/internal/rpc/builder.go b/internal/rpc/builder.go
@@ -127,6 +127,20 @@ func WithRequestTimeout(d time.Duration) ClientOption {
 	}
 }
 
+// WithTotalTimeout sets TotalTimeout on the builder's config: the cap on the total time
+// spent across all nodes in one failover loop (Client methods check TotalTimeout > 0 first).
+// If no config is set yet, it starts from b.getConfig(b.network); the option always returns nil.
+func WithTotalTimeout(d time.Duration) ClientOption {
+	return func(b *clientBuilder) error {
+		if b.config == nil {
+			cfg := b.getConfig(b.network)
+			b.config = &cfg
+		}
+		b.config.TotalTimeout = d
+		return nil
+	}
+}
+
 func WithHTTPClient(client *http.Client) ClientOption {
 	return func(b *clientBuilder) error {
 		b.httpClient = client

diff --git a/internal/rpc/client.go b/internal/rpc/client.go
@@ -71,6 +71,7 @@ type NetworkConfig struct {
 	HorizonURL        string
 	NetworkPassphrase string
 	SorobanRPCURL     string
+	TotalTimeout      time.Duration // Global timeout for multi-node failover loops
 }
 
 // Predefined network configurations
@@ -80,20 +81,23 @@ var (
 		HorizonURL:        TestnetHorizonURL,
 		NetworkPassphrase: "Test SDF Network ; September 2015",
 		SorobanRPCURL:     TestnetSorobanURL,
+		TotalTimeout:      60 * time.Second, // Default 60 seconds for failover
 	}
 
 	MainnetConfig = NetworkConfig{
 		Name:              "mainnet",
 		HorizonURL:        MainnetHorizonURL,
 		NetworkPassphrase: "Public Global Stellar Network ; September 2015",
 		SorobanRPCURL:     MainnetSorobanURL,
+		TotalTimeout:      60 * time.Second, // Default 60 seconds for failover
 	}
 
 	FuturenetConfig = NetworkConfig{
 		Name:              "futurenet",
 		HorizonURL:        FuturenetHorizonURL,
 		NetworkPassphrase: "Test SDF Future Network ; October 2022",
 		SorobanRPCURL:     FuturenetSorobanURL,
+		TotalTimeout:      60 * time.Second, // Default 60 seconds for failover
 	}
 )
 
@@ -487,9 +491,26 @@ type StellarbeatResponse struct {
 
 // GetTransaction fetches the transaction details and full XDR data
 func (c *Client) GetTransaction(ctx context.Context, hash string) (*TransactionResponse, error) {
+	// Apply global timeout if configured
+	if c.Config.TotalTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, c.Config.TotalTimeout)
+		defer cancel()
+	}
+
 	attempts := c.endpointAttempts()
 	var failures []NodeFailure
 	for attempt := 0; attempt < attempts; attempt++ {
+		// Check if context has been cancelled or timed out
+		select {
+		case <-ctx.Done():
+			if ctx.Err() == context.DeadlineExceeded {
+				return nil, errors.WrapRPCTimeout(fmt.Errorf("global timeout exceeded after %v", c.Config.TotalTimeout))
+			}
+			return nil, ctx.Err()
+		default:
+		}
+
 		resp, err := c.getTransactionAttempt(ctx, hash)
 		if err == nil {
 			c.markHorizonSuccess()
@@ -636,9 +657,26 @@ type GetLedgerEntriesResponse struct {
 //
 // GetLedgerHeader fetches ledger header details for a specific sequence with automatic fallback.
 func (c *Client) GetLedgerHeader(ctx context.Context, sequence uint32) (*LedgerHeaderResponse, error) {
+	// Apply global timeout if configured
+	if c.Config.TotalTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, c.Config.TotalTimeout)
+		defer cancel()
+	}
+
 	attempts := c.endpointAttempts()
 	var failures []NodeFailure
 	for attempt := 0; attempt < attempts; attempt++ {
+		// Check if context has been cancelled or timed out
+		select {
+		case <-ctx.Done():
+			if ctx.Err() == context.DeadlineExceeded {
+				return nil, errors.WrapRPCTimeout(fmt.Errorf("global timeout exceeded after %v", c.Config.TotalTimeout))
+			}
+			return nil, ctx.Err()
+		default:
+		}
+
 		resp, err := c.getLedgerHeaderAttempt(ctx, sequence)
 		if err == nil {
 			c.markHorizonSuccess()
@@ -851,8 +889,24 @@ func (c *Client) GetLedgerEntries(ctx context.Context, keys []string) (map[strin
 	}
 
 	// Single batch - use existing failover logic
+	// Apply global timeout if configured
+	if c.Config.TotalTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, c.Config.TotalTimeout)
+		defer cancel()
+	}
+
 	attempts := c.endpointAttempts()
 	for attempt := 0; attempt < attempts; attempt++ {
+		// Check if context has been cancelled or timed out
+		select {
+		case <-ctx.Done():
+			if ctx.Err() == context.DeadlineExceeded {
+				return nil, errors.WrapRPCTimeout(fmt.Errorf("global timeout exceeded after %v", c.Config.TotalTimeout))
+			}
+			return nil, ctx.Err()
+		default:
+		}
 
 		fetchedEntries, err := c.getLedgerEntriesAttempt(ctx, keysToFetch)
 		if err == nil {
@@ -1272,9 +1326,26 @@ type SimulateTransactionResponse struct {
 
 // SimulateTransaction calls Soroban RPC simulateTransaction using a base64 TransactionEnvelope XDR.
 func (c *Client) SimulateTransaction(ctx context.Context, envelopeXdr string) (*SimulateTransactionResponse, error) {
+	// Apply global timeout if configured
+	if c.Config.TotalTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, c.Config.TotalTimeout)
+		defer cancel()
+	}
+
 	attempts := c.endpointAttempts()
 	var failures []NodeFailure
 	for attempt := 0; attempt < attempts; attempt++ {
+		// Check if context has been cancelled or timed out
+		select {
+		case <-ctx.Done():
+			if ctx.Err() == context.DeadlineExceeded {
+				return nil, errors.WrapRPCTimeout(fmt.Errorf("global timeout exceeded after %v", c.Config.TotalTimeout))
+			}
+			return nil, ctx.Err()
+		default:
+		}
+
 		resp, err := c.simulateTransactionAttempt(ctx, envelopeXdr)
 		if err == nil {
 			c.markSorobanSuccess()
@@ -1384,9 +1455,26 @@ func (c *Client) simulateTransactionAttempt(ctx context.Context, envelopeXdr str
 
 // GetHealth checks the health of the Soroban RPC endpoint.
 func (c *Client) GetHealth(ctx context.Context) (*GetHealthResponse, error) {
+	// Apply global timeout if configured
+	if c.Config.TotalTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, c.Config.TotalTimeout)
+		defer cancel()
+	}
+
 	attempts := c.endpointAttempts()
 	var failures []NodeFailure
 	for attempt := 0; attempt < attempts; attempt++ {
+		// Check if context has been cancelled or timed out
+		select {
+		case <-ctx.Done():
+			if ctx.Err() == context.DeadlineExceeded {
+				return nil, errors.WrapRPCTimeout(fmt.Errorf("global timeout exceeded after %v", c.Config.TotalTimeout))
+			}
+			return nil, ctx.Err()
+		default:
+		}
+
 		resp, err := c.getHealthAttempt(ctx)
 		if err == nil {
 			c.markSorobanSuccess()