package config

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"math/rand/v2"
	"sync"
	"sync/atomic"
	"time"

	"github.com/rbaliyan/config/codec"
)

// Manager manages configuration across namespaces.
//
// The Manager wraps a Store with caching and automatic cache invalidation.
// It provides access to namespaced configuration via the Namespace method.
//
// Example:
//
//	// Create manager
//	mgr, err := config.New(
//		config.WithStore(memory.NewStore()),
//	)
//	if err != nil {
//		return err
//	}
//
//	// Connect to backend
//	if err := mgr.Connect(ctx); err != nil {
//		return err
//	}
//	defer mgr.Close(ctx)
//
//	// Get configuration for a namespace (use "" for default)
//	prodConfig := mgr.Namespace("production")
//
//	// Use Reader interface in application code
//	val, err := prodConfig.Get(ctx, "app/database/timeout")
//	if err != nil {
//		return err
//	}
//	var timeout int
//	if err := val.Unmarshal(&timeout); err != nil {
//		return err
//	}
//
//	// Use Writer interface for management
//	if err := prodConfig.Set(ctx, "app/database/timeout", 30); err != nil {
//		return err
//	}
type Manager interface {
	// Connect establishes connection to the backend and starts watching.
	// Must be called before any other operations.
	Connect(ctx context.Context) error

	// Close stops watching and releases resources.
	Close(ctx context.Context) error

	// Namespace returns a Config for the specified namespace.
	// Use "" for the default namespace.
	Namespace(name string) Config

	// Refresh forces a cache refresh for a specific key.
	Refresh(ctx context.Context, namespace, key string) error

	// Health performs a health check on the manager and underlying store.
	// Returns nil if healthy, or an error describing the issue.
	// Includes watch status - returns an error if watch has consecutive failures.
	Health(ctx context.Context) error
}

// ManagerObserver provides observability into the Manager's internal state.
// Use a type assertion to access these methods:
//
//	if obs, ok := mgr.(config.ManagerObserver); ok {
//		stats := obs.CacheStats()
//		status := obs.WatchStatus()
//	}
type ManagerObserver interface {
	// CacheStats returns statistics about the internal cache.
	CacheStats() CacheStats

	// WatchStatus returns the current status of the watch connection.
	WatchStatus() WatchStatus
}

// WatchStatus provides observability into the watch connection state.
// Applications can use this to monitor watch health and implement
// their own alerting or circuit breaking logic if needed.
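//
// An illustrative monitoring sketch (assumes the Manager also implements
// ManagerObserver, as the default implementation does; the alert threshold
// of 5 is an arbitrary choice, not part of this package):
//
//	if obs, ok := mgr.(config.ManagerObserver); ok {
//		status := obs.WatchStatus()
//		if status.ConsecutiveFailures >= 5 {
//			log.Printf("watch degraded: %s (last attempt %s)",
//				status.LastError, status.LastAttempt)
//		}
//	}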
type WatchStatus struct {
	// Connected indicates if the manager is connected to the store.
	Connected bool `json:"connected"`

	// ConsecutiveFailures is the number of consecutive watch failures.
	// Resets to 0 when watch successfully connects.
	ConsecutiveFailures int32 `json:"consecutive_failures"`

	// LastError is the most recent watch error message (empty if no error).
	LastError string `json:"last_error,omitempty"`

	// LastAttempt is when the last watch connection was attempted.
	LastAttempt time.Time `json:"last_attempt,omitempty"`

	// Cache contains cache statistics for correlation with watch health.
	Cache CacheStats `json:"cache"`
}

// manager is the default Manager implementation.
type manager struct {
	connectMu sync.Mutex // serializes Connect and Close
	status    int32      // 0=created, 1=connected, 2=closed

	store  Store
	cache  cache // internal cache for resilience
	codec  codec.Codec
	logger *slog.Logger

	watchCancel context.CancelFunc
	watchWg     sync.WaitGroup

	// Watch backoff configuration and status
	watchBackoff  watchBackoffConfig
	watchFailures atomic.Int32           // consecutive failures for observability
	lastWatchErr  atomic.Pointer[string] // last watch error message (nil = no error)
	lastWatchTime atomic.Int64           // unix timestamp of last watch attempt

	// Config cache
	configMu sync.RWMutex
	configs  map[string]*nsConfig

	maxKeysPerNS int // 0 = unlimited
}

// Compile-time interface checks
var (
	_ Manager         = (*manager)(nil)
	_ ManagerObserver = (*manager)(nil)
)

// New creates a new configuration Manager.
//
// The manager is created but not connected. Call Connect() before use.
// This follows the New/Connect split pattern for better error handling.
//
// The manager always maintains an internal cache for resilience. If the backend
// store becomes temporarily unavailable, cached values will continue to be served.
// This ensures your application keeps working during database outages.
//
// Returns an error if cache initialization fails.
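//
// A minimal lifecycle sketch (memory.NewStore is the in-memory store from the
// Manager example above; any Store implementation works the same way):
//
//	mgr, err := config.New(config.WithStore(memory.NewStore()))
//	if err != nil {
//		return err // cache initialization failed
//	}
//	if err := mgr.Connect(ctx); err != nil {
//		return err // backend unreachable
//	}
//	defer mgr.Close(ctx)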
func New(opts ...Option) (Manager, error) {
	o := newManagerOptions()
	for _, opt := range opts {
		opt(o)
	}

	// Create internal cache for resilience (bounded, no expiration)
	cache, err := newMemoryCache(0) // Use default capacity
	if err != nil {
		return nil, fmt.Errorf("failed to initialize cache: %w", err)
	}

	m := &manager{
		status:       0,
		store:        o.store,
		codec:        o.codec,
		logger:       o.logger.With("component", "config"),
		watchBackoff: o.watchBackoff,
		configs:      make(map[string]*nsConfig),
		cache:        cache,
		maxKeysPerNS: o.maxKeysPerNS,
	}
	return m, nil
}

// Connect establishes connection to the backend and starts watching.
//
// The provided context is used only for the initial store connection.
// The watch goroutine runs independently with its own context until Close() is called.
// This is intentional: the watch should continue running even if the Connect context
// times out, as the watch is a long-running background operation.
func (m *manager) Connect(ctx context.Context) error {
	if m.store == nil {
		return ErrStoreNotConnected
	}

	m.connectMu.Lock()
	defer m.connectMu.Unlock()

	// Only connect from "created" state (0). Reject if already connected (1) or closed (2).
	status := atomic.LoadInt32(&m.status)
	if status == 2 {
		return ErrManagerClosed
	}
	if status == 1 {
		return nil // Already connected
	}

	// Connect to store (uses caller's context for connection timeout).
	// Status remains 0 during this call so concurrent operations correctly
	// see the manager as not yet connected.
	if err := m.store.Connect(ctx); err != nil {
		return err
	}

	// Mark as connected only after store.Connect succeeds
	atomic.StoreInt32(&m.status, 1)

	// Start watching for changes (for internal cache invalidation).
	// Uses an independent context - the watch should run until Close(), not until Connect's context expires.
	// This uses the store's native change stream (MongoDB Change Streams, PostgreSQL LISTEN/NOTIFY).
	watchCtx, cancel := context.WithCancel(context.Background())
	m.watchCancel = cancel
	m.watchWg.Add(1)
	go m.watchChanges(watchCtx)

	m.logger.Info("config manager connected")
	return nil
}

// Close stops watching and releases resources.
func (m *manager) Close(ctx context.Context) error {
	m.connectMu.Lock()
	if !atomic.CompareAndSwapInt32(&m.status, 1, 2) {
		m.connectMu.Unlock()
		return nil // Already closed or not connected
	}
	m.connectMu.Unlock()

	// Stop watching
	if m.watchCancel != nil {
		m.watchCancel()
	}
	m.watchWg.Wait()

	// Close store
	if m.store != nil {
		if err := m.store.Close(ctx); err != nil {
			m.logger.Error("failed to close store", "error", err)
		}
	}

	m.logger.Info("config manager closed")
	return nil
}

func (m *manager) isConnected() bool {
	return atomic.LoadInt32(&m.status) == 1
}

// Namespace returns a Config for the specified namespace.
// Use "" for the default namespace.
func (m *manager) Namespace(name string) Config {
	m.configMu.RLock()
	cfg, ok := m.configs[name]
	m.configMu.RUnlock()
	if ok {
		return cfg
	}

	// Create new config for namespace
	m.configMu.Lock()
	defer m.configMu.Unlock()

	// Double-check after acquiring write lock
	if cfg, ok = m.configs[name]; ok {
		return cfg
	}

	cfg = &nsConfig{
		namespace: name,
		manager:   m,
	}
	m.configs[name] = cfg
	return cfg
}

// Refresh forces a cache refresh for a specific key.
// It fetches the latest value from the store and updates the cache.
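//
// Example (sketch; the namespace and key are the illustrative ones from the
// Manager example above):
//
//	if err := mgr.Refresh(ctx, "production", "app/database/timeout"); err != nil {
//		return err
//	}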
func (m *manager) Refresh(ctx context.Context, namespace, key string) error {
	if !m.isConnected() {
		return ErrManagerClosed
	}

	// Fetch fresh data from store
	value, err := m.store.Get(ctx, namespace, key)
	if err != nil {
		// If key not found, remove from cache
		if IsNotFound(err) && m.cache != nil {
			_ = m.cache.Delete(ctx, namespace, key)
		}
		return err
	}

	// Update cache with fresh data
	if m.cache != nil {
		if err := m.cache.Set(ctx, namespace, key, value); err != nil {
			m.logger.Warn("failed to update cache during refresh", "key", key, "error", err)
		}
	}
	return nil
}

// Health performs a health check on the manager and underlying store.
// Returns an error if the manager is closed, the store is unhealthy,
// or if watch has experienced multiple consecutive failures.
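//
// A sketch of wiring Health into an HTTP liveness endpoint (the route and
// handler shape are illustrative, not part of this package):
//
//	http.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
//		if err := mgr.Health(r.Context()); err != nil {
//			http.Error(w, err.Error(), http.StatusServiceUnavailable)
//			return
//		}
//		w.WriteHeader(http.StatusOK)
//	})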
func (m *manager) Health(ctx context.Context) error {
	if !m.isConnected() {
		return ErrManagerClosed
	}

	// Check watch status - report unhealthy if multiple consecutive failures
	failures := m.watchFailures.Load()
	if failures >= 3 {
		lastErr := ""
		if p := m.lastWatchErr.Load(); p != nil {
			lastErr = *p
		}
		return &WatchHealthError{ConsecutiveFailures: failures, LastError: lastErr}
	}

	// Check if store supports health checks
	if hc, ok := m.store.(HealthChecker); ok {
		return hc.Health(ctx)
	}

	// No health checker available, assume healthy if connected
	return nil
}

// CacheStats returns statistics about the internal cache.
func (m *manager) CacheStats() CacheStats {
	if m.cache != nil {
		return m.cache.Stats()
	}
	return CacheStats{}
}

// WatchStatus returns the current status of the watch connection.
func (m *manager) WatchStatus() WatchStatus {
	status := WatchStatus{
		Connected:           m.isConnected(),
		ConsecutiveFailures: m.watchFailures.Load(),
		Cache:               m.CacheStats(),
	}
	if p := m.lastWatchErr.Load(); p != nil {
		status.LastError = *p
	}
	if ts := m.lastWatchTime.Load(); ts > 0 {
		status.LastAttempt = time.Unix(ts, 0)
	}
	return status
}

// watchChanges handles cache invalidation based on the store's change stream.
// The store provides the change stream (e.g., MongoDB change streams, PostgreSQL LISTEN/NOTIFY).
// Uses exponential backoff for reconnection - the internal cache provides resilience
// during backend unavailability, so aggressive circuit breaking is not needed.
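//
// For illustration only (the concrete values live in watchBackoffConfig and
// are assumptions here): with initialBackoff = 1s, backoffFactor = 2, and
// maxBackoff = 30s, successive retry delays grow 1s, 2s, 4s, 8s, 16s, 30s,
// 30s, ... and each delay is then multiplied by a jitter factor in [0.5, 1.5).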
func (m *manager) watchChanges(ctx context.Context) {
	defer m.watchWg.Done()

	cfg := m.watchBackoff
	backoff := cfg.initialBackoff
	for {
		// Check if context is cancelled before attempting to watch
		select {
		case <-ctx.Done():
			return
		default:
		}

		m.lastWatchTime.Store(time.Now().Unix())
		changes, err := m.store.Watch(ctx, WatchFilter{})
		if err != nil {
			if errors.Is(err, ErrWatchNotSupported) {
				// Store doesn't support watching, exit without retry
				m.logger.Debug("store does not support watching")
				return
			}

			failures := m.watchFailures.Add(1)
			errMsg := err.Error()
			m.lastWatchErr.Store(&errMsg)
			m.logger.Warn("failed to start watching",
				"error", err,
				"backoff", backoff,
				"consecutive_failures", failures)

			// Wait with backoff before retrying
			select {
			case <-ctx.Done():
				return
			case <-time.After(jitteredBackoff(backoff)):
			}
			// Increase backoff for next retry (with cap)
			backoff = min(time.Duration(float64(backoff)*cfg.backoffFactor), cfg.maxBackoff)
			continue
		}

		// Successfully connected, reset failures and backoff
		m.watchFailures.Store(0)
		m.lastWatchErr.Store(nil)
		backoff = cfg.initialBackoff
		m.logger.Debug("watch started successfully")

		// Process changes until channel closes or context cancelled
		channelClosed := m.processWatchEvents(ctx, changes)
		if !channelClosed {
			// Context was cancelled, exit
			return
		}

		// Channel closed unexpectedly, log and retry
		m.logger.Warn("watch channel closed, will reconnect", "backoff", backoff)
		select {
		case <-ctx.Done():
			return
		case <-time.After(jitteredBackoff(backoff)):
		}
		backoff = min(time.Duration(float64(backoff)*cfg.backoffFactor), cfg.maxBackoff)
	}
}

// processWatchEvents handles incoming change events.
// Returns true if the channel was closed, false if context was cancelled.
func (m *manager) processWatchEvents(ctx context.Context, changes <-chan ChangeEvent) bool {
	for {
		select {
		case <-ctx.Done():
			return false
		case change, ok := <-changes:
			if !ok {
				return true // Channel closed
			}
			m.handleChange(ctx, change)
		}
	}
}

// handleChange processes a change event and updates the cache.
// This function is safe to call even during shutdown - cache operations
// are protected by the cache's internal lock, and we gracefully handle
// the case where the manager is closing.
func (m *manager) handleChange(ctx context.Context, change ChangeEvent) {
	// Early return if manager is closing or closed
	if !m.isConnected() {
		return
	}

	// Cache reference - capture once to avoid race with Close()
	cache := m.cache
	if cache == nil {
		return
	}

	switch change.Type {
	case ChangeTypeSet:
		if change.Value != nil {
			if err := cache.Set(ctx, change.Namespace, change.Key, change.Value); err != nil {
				// Only log if still connected (avoid spurious errors during shutdown)
				if m.isConnected() {
					m.logger.Warn("failed to update cache", "namespace", change.Namespace, "key", change.Key, "error", err)
				}
			}
		}
	case ChangeTypeDelete:
		if err := cache.Delete(ctx, change.Namespace, change.Key); err != nil {
			// Only log if still connected (avoid spurious errors during shutdown)
			if m.isConnected() {
				m.logger.Warn("failed to invalidate cache", "namespace", change.Namespace, "key", change.Key, "error", err)
			}
		}
	}
}

// jitteredBackoff adds jitter to a backoff duration.
// Since rand.Float64 returns a value in [0, 1), the multiplier 0.5+rand.Float64()
// lies in [0.5, 1.5), so the result is a duration in the range [0.5*d, 1.5*d).
func jitteredBackoff(d time.Duration) time.Duration {
	return time.Duration(float64(d) * (0.5 + rand.Float64())) // #nosec G404 -- jitter does not require cryptographic randomness
}